git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
  34 /* snow.c */
  35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  36
  37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  38 uint32_t squareTbl[512] = {0, };
  39
  40 const uint8_t ff_zigzag_direct[64] = {
  41     0,   1,  8, 16,  9,  2,  3, 10,
  42     17, 24, 32, 25, 18, 11,  4,  5,
  43     12, 19, 26, 33, 40, 48, 41, 34,
  44     27, 20, 13,  6,  7, 14, 21, 28,
  45     35, 42, 49, 56, 57, 50, 43, 36,
  46     29, 22, 15, 23, 30, 37, 44, 51,
  47     58, 59, 52, 45, 38, 31, 39, 46,
  48     53, 60, 61, 54, 47, 55, 62, 63
  49 };
  50
  51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  52    specification, we interleave the fields */
  53 const uint8_t ff_zigzag248_direct[64] = {
  54      0,  8,  1,  9, 16, 24,  2, 10,
  55     17, 25, 32, 40, 48, 56, 33, 41,
  56     18, 26,  3, 11,  4, 12, 19, 27,
  57     34, 42, 49, 57, 50, 58, 35, 43,
  58     20, 28,  5, 13,  6, 14, 21, 29,
  59     36, 44, 51, 59, 52, 60, 37, 45,
  60     22, 30,  7, 15, 23, 31, 38, 46,
  61     53, 61, 54, 62, 39, 47, 55, 63,
  62 };
  63
  64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
  66
  67 const uint8_t ff_alternate_horizontal_scan[64] = {
  68     0,  1,   2,  3,  8,  9, 16, 17,
  69     10, 11,  4,  5,  6,  7, 15, 14,
  70     13, 12, 19, 18, 24, 25, 32, 33,
  71     26, 27, 20, 21, 22, 23, 28, 29,
  72     30, 31, 34, 35, 40, 41, 48, 49,
  73     42, 43, 36, 37, 38, 39, 44, 45,
  74     46, 47, 50, 51, 56, 57, 58, 59,
  75     52, 53, 54, 55, 60, 61, 62, 63,
  76 };
  77
  78 const uint8_t ff_alternate_vertical_scan[64] = {
  79     0,  8,  16, 24,  1,  9,  2, 10,
  80     17, 25, 32, 40, 48, 56, 57, 49,
  81     41, 33, 26, 18,  3, 11,  4, 12,
  82     19, 27, 34, 42, 50, 58, 35, 43,
  83     51, 59, 20, 28,  5, 13,  6, 14,
  84     21, 29, 36, 44, 52, 60, 37, 45,
  85     53, 61, 22, 30,  7, 15, 23, 31,
  86     38, 46, 54, 62, 39, 47, 55, 63,
  87 };
  88
  89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  90 const uint32_t inverse[256]={
  91          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  92  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  93  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  94  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  95  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  96  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  97   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  98   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  99   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 100   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 101   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 102   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 103   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 104   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 105   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 106   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 107   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 108   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 109   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 110   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 111   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 112   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 113   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 114   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 115   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 116   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 117   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 118   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 119   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 120   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 121   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 122   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 123 };
 124
 125 /* Input permutation for the simple_idct_mmx */
 126 static const uint8_t simple_mmx_permutation[64]={
 127         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 128         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 129         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 130         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 131         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 132         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 133         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 134         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 135 };
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= bswap_32(src[i+0]);
 212         dst[i+1]= bswap_32(src[i+1]);
 213         dst[i+2]= bswap_32(src[i+2]);
 214         dst[i+3]= bswap_32(src[i+3]);
 215         dst[i+4]= bswap_32(src[i+4]);
 216         dst[i+5]= bswap_32(src[i+5]);
 217         dst[i+6]= bswap_32(src[i+6]);
 218         dst[i+7]= bswap_32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= bswap_32(src[i+0]);
 222     }
 223 }
 224
 225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 226 {
 227     int s, i;
 228     uint32_t *sq = squareTbl + 256;
 229
 230     s = 0;
 231     for (i = 0; i < h; i++) {
 232         s += sq[pix1[0] - pix2[0]];
 233         s += sq[pix1[1] - pix2[1]];
 234         s += sq[pix1[2] - pix2[2]];
 235         s += sq[pix1[3] - pix2[3]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         s += sq[pix1[4] - pix2[4]];
 254         s += sq[pix1[5] - pix2[5]];
 255         s += sq[pix1[6] - pix2[6]];
 256         s += sq[pix1[7] - pix2[7]];
 257         pix1 += line_size;
 258         pix2 += line_size;
 259     }
 260     return s;
 261 }
 262
 263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[ 0] - pix2[ 0]];
 271         s += sq[pix1[ 1] - pix2[ 1]];
 272         s += sq[pix1[ 2] - pix2[ 2]];
 273         s += sq[pix1[ 3] - pix2[ 3]];
 274         s += sq[pix1[ 4] - pix2[ 4]];
 275         s += sq[pix1[ 5] - pix2[ 5]];
 276         s += sq[pix1[ 6] - pix2[ 6]];
 277         s += sq[pix1[ 7] - pix2[ 7]];
 278         s += sq[pix1[ 8] - pix2[ 8]];
 279         s += sq[pix1[ 9] - pix2[ 9]];
 280         s += sq[pix1[10] - pix2[10]];
 281         s += sq[pix1[11] - pix2[11]];
 282         s += sq[pix1[12] - pix2[12]];
 283         s += sq[pix1[13] - pix2[13]];
 284         s += sq[pix1[14] - pix2[14]];
 285         s += sq[pix1[15] - pix2[15]];
 286
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293
 294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 295 #ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
 296     int s, i, j;
 297     const int dec_count= w==8 ? 3 : 4;
 298     int tmp[16*16];
 299 #if 0
 300     int level, ori;
 301     static const int scale[2][2][4][4]={
 302       {
 303         {
 304             //8x8 dec=3
 305             {268, 239, 239, 213},
 306             {  0, 224, 224, 152},
 307             {  0, 135, 135, 110},
 308         },{
 309             //16x16 dec=4
 310             {344, 310, 310, 280},
 311             {  0, 320, 320, 228},
 312             {  0, 175, 175, 136},
 313             {  0, 129, 129, 102},
 314         }
 315       },{
 316         {//FIXME 5/3
 317             //8x8 dec=3
 318             {275, 245, 245, 218},
 319             {  0, 230, 230, 156},
 320             {  0, 138, 138, 113},
 321         },{
 322             //16x16 dec=4
 323             {352, 317, 317, 286},
 324             {  0, 328, 328, 233},
 325             {  0, 180, 180, 140},
 326             {  0, 132, 132, 105},
 327         }
 328       }
 329     };
 330 #endif
 331
 332     for (i = 0; i < h; i++) {
 333         for (j = 0; j < w; j+=4) {
 334             tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 335             tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 336             tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 337             tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 338         }
 339         pix1 += line_size;
 340         pix2 += line_size;
 341     }
 342
 343     ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
 344
 345     s=0;
 346 #if 0
 347     for(level=0; level<dec_count; level++){
 348         for(ori= level ? 1 : 0; ori<4; ori++){
 349             int sx= (ori&1) ? 1<<level: 0;
 350             int stride= 16<<(dec_count-level);
 351             int sy= (ori&2) ? stride>>1 : 0;
 352             int size= 1<<level;
 353
 354             for(i=0; i<size; i++){
 355                 for(j=0; j<size; j++){
 356                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 357                     s += ABS(v);
 358                 }
 359             }
 360         }
 361     }
 362 #endif
 363     for (i = 0; i < h; i++) {
 364         for (j = 0; j < w; j+=4) {
 365             s+= ABS(tmp[16*i+j+0]);
 366             s+= ABS(tmp[16*i+j+1]);
 367             s+= ABS(tmp[16*i+j+2]);
 368             s+= ABS(tmp[16*i+j+3]);
 369         }
 370     }
 371     assert(s>=0);
 372
 373     return s>>2;
 374 #endif
 375 }
 376
 377 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 378     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 379 }
 380
 381 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 382     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 383 }
 384
 385 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 386     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 387 }
 388
 389 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 390     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 391 }
 392
 393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 394 {
 395     int i;
 396
 397     /* read the pixels */
 398     for(i=0;i<8;i++) {
 399         block[0] = pixels[0];
 400         block[1] = pixels[1];
 401         block[2] = pixels[2];
 402         block[3] = pixels[3];
 403         block[4] = pixels[4];
 404         block[5] = pixels[5];
 405         block[6] = pixels[6];
 406         block[7] = pixels[7];
 407         pixels += line_size;
 408         block += 8;
 409     }
 410 }
 411
 412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 413                           const uint8_t *s2, int stride){
 414     int i;
 415
 416     /* read the pixels */
 417     for(i=0;i<8;i++) {
 418         block[0] = s1[0] - s2[0];
 419         block[1] = s1[1] - s2[1];
 420         block[2] = s1[2] - s2[2];
 421         block[3] = s1[3] - s2[3];
 422         block[4] = s1[4] - s2[4];
 423         block[5] = s1[5] - s2[5];
 424         block[6] = s1[6] - s2[6];
 425         block[7] = s1[7] - s2[7];
 426         s1 += stride;
 427         s2 += stride;
 428         block += 8;
 429     }
 430 }
 431
 432
 433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 434                                  int line_size)
 435 {
 436     int i;
 437     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 438
 439     /* read the pixels */
 440     for(i=0;i<8;i++) {
 441         pixels[0] = cm[block[0]];
 442         pixels[1] = cm[block[1]];
 443         pixels[2] = cm[block[2]];
 444         pixels[3] = cm[block[3]];
 445         pixels[4] = cm[block[4]];
 446         pixels[5] = cm[block[5]];
 447         pixels[6] = cm[block[6]];
 448         pixels[7] = cm[block[7]];
 449
 450         pixels += line_size;
 451         block += 8;
 452     }
 453 }
 454
 455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 456                                  int line_size)
 457 {
 458     int i;
 459     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 460
 461     /* read the pixels */
 462     for(i=0;i<4;i++) {
 463         pixels[0] = cm[block[0]];
 464         pixels[1] = cm[block[1]];
 465         pixels[2] = cm[block[2]];
 466         pixels[3] = cm[block[3]];
 467
 468         pixels += line_size;
 469         block += 8;
 470     }
 471 }
 472
 473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 474                                  int line_size)
 475 {
 476     int i;
 477     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 478
 479     /* read the pixels */
 480     for(i=0;i<2;i++) {
 481         pixels[0] = cm[block[0]];
 482         pixels[1] = cm[block[1]];
 483
 484         pixels += line_size;
 485         block += 8;
 486     }
 487 }
 488
 489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 490                                         uint8_t *restrict pixels,
 491                                         int line_size)
 492 {
 493     int i, j;
 494
 495     for (i = 0; i < 8; i++) {
 496         for (j = 0; j < 8; j++) {
 497             if (*block < -128)
 498                 *pixels = 0;
 499             else if (*block > 127)
 500                 *pixels = 255;
 501             else
 502                 *pixels = (uint8_t)(*block + 128);
 503             block++;
 504             pixels++;
 505         }
 506         pixels += (line_size - 8);
 507     }
 508 }
 509
 510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 511                           int line_size)
 512 {
 513     int i;
 514     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 515
 516     /* read the pixels */
 517     for(i=0;i<8;i++) {
 518         pixels[0] = cm[pixels[0] + block[0]];
 519         pixels[1] = cm[pixels[1] + block[1]];
 520         pixels[2] = cm[pixels[2] + block[2]];
 521         pixels[3] = cm[pixels[3] + block[3]];
 522         pixels[4] = cm[pixels[4] + block[4]];
 523         pixels[5] = cm[pixels[5] + block[5]];
 524         pixels[6] = cm[pixels[6] + block[6]];
 525         pixels[7] = cm[pixels[7] + block[7]];
 526         pixels += line_size;
 527         block += 8;
 528     }
 529 }
 530
 531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 532                           int line_size)
 533 {
 534     int i;
 535     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 536
 537     /* read the pixels */
 538     for(i=0;i<4;i++) {
 539         pixels[0] = cm[pixels[0] + block[0]];
 540         pixels[1] = cm[pixels[1] + block[1]];
 541         pixels[2] = cm[pixels[2] + block[2]];
 542         pixels[3] = cm[pixels[3] + block[3]];
 543         pixels += line_size;
 544         block += 8;
 545     }
 546 }
 547
 548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 549                           int line_size)
 550 {
 551     int i;
 552     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 553
 554     /* read the pixels */
 555     for(i=0;i<2;i++) {
 556         pixels[0] = cm[pixels[0] + block[0]];
 557         pixels[1] = cm[pixels[1] + block[1]];
 558         pixels += line_size;
 559         block += 8;
 560     }
 561 }
 562
 563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 564 {
 565     int i;
 566     for(i=0;i<8;i++) {
 567         pixels[0] += block[0];
 568         pixels[1] += block[1];
 569         pixels[2] += block[2];
 570         pixels[3] += block[3];
 571         pixels[4] += block[4];
 572         pixels[5] += block[5];
 573         pixels[6] += block[6];
 574         pixels[7] += block[7];
 575         pixels += line_size;
 576         block += 8;
 577     }
 578 }
 579
 580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 581 {
 582     int i;
 583     for(i=0;i<4;i++) {
 584         pixels[0] += block[0];
 585         pixels[1] += block[1];
 586         pixels[2] += block[2];
 587         pixels[3] += block[3];
 588         pixels += line_size;
 589         block += 4;
 590     }
 591 }
 592
 593 #if 0
 594
 595 #define PIXOP2(OPNAME, OP) \
 596 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 597 {\
 598     int i;\
 599     for(i=0; i<h; i++){\
 600         OP(*((uint64_t*)block), LD64(pixels));\
 601         pixels+=line_size;\
 602         block +=line_size;\
 603     }\
 604 }\
 605 \
 606 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 607 {\
 608     int i;\
 609     for(i=0; i<h; i++){\
 610         const uint64_t a= LD64(pixels  );\
 611         const uint64_t b= LD64(pixels+1);\
 612         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 613         pixels+=line_size;\
 614         block +=line_size;\
 615     }\
 616 }\
 617 \
 618 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 619 {\
 620     int i;\
 621     for(i=0; i<h; i++){\
 622         const uint64_t a= LD64(pixels  );\
 623         const uint64_t b= LD64(pixels+1);\
 624         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 625         pixels+=line_size;\
 626         block +=line_size;\
 627     }\
 628 }\
 629 \
 630 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 631 {\
 632     int i;\
 633     for(i=0; i<h; i++){\
 634         const uint64_t a= LD64(pixels          );\
 635         const uint64_t b= LD64(pixels+line_size);\
 636         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 637         pixels+=line_size;\
 638         block +=line_size;\
 639     }\
 640 }\
 641 \
 642 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 643 {\
 644     int i;\
 645     for(i=0; i<h; i++){\
 646         const uint64_t a= LD64(pixels          );\
 647         const uint64_t b= LD64(pixels+line_size);\
 648         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 649         pixels+=line_size;\
 650         block +=line_size;\
 651     }\
 652 }\
 653 \
 654 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 655 {\
 656         int i;\
 657         const uint64_t a= LD64(pixels  );\
 658         const uint64_t b= LD64(pixels+1);\
 659         uint64_t l0=  (a&0x0303030303030303ULL)\
 660                     + (b&0x0303030303030303ULL)\
 661                     + 0x0202020202020202ULL;\
 662         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 663                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 664         uint64_t l1,h1;\
 665 \
 666         pixels+=line_size;\
 667         for(i=0; i<h; i+=2){\
 668             uint64_t a= LD64(pixels  );\
 669             uint64_t b= LD64(pixels+1);\
 670             l1=  (a&0x0303030303030303ULL)\
 671                + (b&0x0303030303030303ULL);\
 672             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 673               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 674             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 675             pixels+=line_size;\
 676             block +=line_size;\
 677             a= LD64(pixels  );\
 678             b= LD64(pixels+1);\
 679             l0=  (a&0x0303030303030303ULL)\
 680                + (b&0x0303030303030303ULL)\
 681                + 0x0202020202020202ULL;\
 682             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 683               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 684             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 685             pixels+=line_size;\
 686             block +=line_size;\
 687         }\
 688 }\
 689 \
 690 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 691 {\
 692         int i;\
 693         const uint64_t a= LD64(pixels  );\
 694         const uint64_t b= LD64(pixels+1);\
 695         uint64_t l0=  (a&0x0303030303030303ULL)\
 696                     + (b&0x0303030303030303ULL)\
 697                     + 0x0101010101010101ULL;\
 698         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 699                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 700         uint64_t l1,h1;\
 701 \
 702         pixels+=line_size;\
 703         for(i=0; i<h; i+=2){\
 704             uint64_t a= LD64(pixels  );\
 705             uint64_t b= LD64(pixels+1);\
 706             l1=  (a&0x0303030303030303ULL)\
 707                + (b&0x0303030303030303ULL);\
 708             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 709               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 710             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 711             pixels+=line_size;\
 712             block +=line_size;\
 713             a= LD64(pixels  );\
 714             b= LD64(pixels+1);\
 715             l0=  (a&0x0303030303030303ULL)\
 716                + (b&0x0303030303030303ULL)\
 717                + 0x0101010101010101ULL;\
 718             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 719               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 720             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 721             pixels+=line_size;\
 722             block +=line_size;\
 723         }\
 724 }\
 725 \
 726 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 727 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 728 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 729 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 730 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 733
 734 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 735 #else // 64 bit variant
 736
 737 #define PIXOP2(OPNAME, OP) \
 738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 739     int i;\
 740     for(i=0; i<h; i++){\
 741         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 742         pixels+=line_size;\
 743         block +=line_size;\
 744     }\
 745 }\
 746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 747     int i;\
 748     for(i=0; i<h; i++){\
 749         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 750         pixels+=line_size;\
 751         block +=line_size;\
 752     }\
 753 }\
 754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 755     int i;\
 756     for(i=0; i<h; i++){\
 757         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 758         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 759         pixels+=line_size;\
 760         block +=line_size;\
 761     }\
 762 }\
 763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 764     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 765 }\
 766 \
 767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 768                                                 int src_stride1, int src_stride2, int h){\
 769     int i;\
 770     for(i=0; i<h; i++){\
 771         uint32_t a,b;\
 772         a= LD32(&src1[i*src_stride1  ]);\
 773         b= LD32(&src2[i*src_stride2  ]);\
 774         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 775         a= LD32(&src1[i*src_stride1+4]);\
 776         b= LD32(&src2[i*src_stride2+4]);\
 777         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 778     }\
 779 }\
 780 \
 781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 782                                                 int src_stride1, int src_stride2, int h){\
 783     int i;\
 784     for(i=0; i<h; i++){\
 785         uint32_t a,b;\
 786         a= LD32(&src1[i*src_stride1  ]);\
 787         b= LD32(&src2[i*src_stride2  ]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 789         a= LD32(&src1[i*src_stride1+4]);\
 790         b= LD32(&src2[i*src_stride2+4]);\
 791         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 792     }\
 793 }\
 794 \
 795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 796                                                 int src_stride1, int src_stride2, int h){\
 797     int i;\
 798     for(i=0; i<h; i++){\
 799         uint32_t a,b;\
 800         a= LD32(&src1[i*src_stride1  ]);\
 801         b= LD32(&src2[i*src_stride2  ]);\
 802         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 803     }\
 804 }\
 805 \
 806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 807                                                 int src_stride1, int src_stride2, int h){\
 808     int i;\
 809     for(i=0; i<h; i++){\
 810         uint32_t a,b;\
 811         a= LD16(&src1[i*src_stride1  ]);\
 812         b= LD16(&src2[i*src_stride2  ]);\
 813         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 814     }\
 815 }\
 816 \
 817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 818                                                 int src_stride1, int src_stride2, int h){\
 819     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 820     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 821 }\
 822 \
 823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 824                                                 int src_stride1, int src_stride2, int h){\
 825     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 826     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 827 }\
 828 \
 829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 834     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 835 }\
 836 \
 837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 838     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 839 }\
 840 \
 841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 842     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 843 }\
 844 \
 845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 846                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 847     int i;\
 848     for(i=0; i<h; i++){\
 849         uint32_t a, b, c, d, l0, l1, h0, h1;\
 850         a= LD32(&src1[i*src_stride1]);\
 851         b= LD32(&src2[i*src_stride2]);\
 852         c= LD32(&src3[i*src_stride3]);\
 853         d= LD32(&src4[i*src_stride4]);\
 854         l0=  (a&0x03030303UL)\
 855            + (b&0x03030303UL)\
 856            + 0x02020202UL;\
 857         h0= ((a&0xFCFCFCFCUL)>>2)\
 858           + ((b&0xFCFCFCFCUL)>>2);\
 859         l1=  (c&0x03030303UL)\
 860            + (d&0x03030303UL);\
 861         h1= ((c&0xFCFCFCFCUL)>>2)\
 862           + ((d&0xFCFCFCFCUL)>>2);\
 863         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 864         a= LD32(&src1[i*src_stride1+4]);\
 865         b= LD32(&src2[i*src_stride2+4]);\
 866         c= LD32(&src3[i*src_stride3+4]);\
 867         d= LD32(&src4[i*src_stride4+4]);\
 868         l0=  (a&0x03030303UL)\
 869            + (b&0x03030303UL)\
 870            + 0x02020202UL;\
 871         h0= ((a&0xFCFCFCFCUL)>>2)\
 872           + ((b&0xFCFCFCFCUL)>>2);\
 873         l1=  (c&0x03030303UL)\
 874            + (d&0x03030303UL);\
 875         h1= ((c&0xFCFCFCFCUL)>>2)\
 876           + ((d&0xFCFCFCFCUL)>>2);\
 877         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 878     }\
 879 }\
 880 \
 881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 882     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 883 }\
 884 \
 885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 886     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 887 }\
 888 \
 889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 890     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 891 }\
 892 \
 893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 894     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 895 }\
 896 \
 897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 898                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 899     int i;\
 900     for(i=0; i<h; i++){\
 901         uint32_t a, b, c, d, l0, l1, h0, h1;\
 902         a= LD32(&src1[i*src_stride1]);\
 903         b= LD32(&src2[i*src_stride2]);\
 904         c= LD32(&src3[i*src_stride3]);\
 905         d= LD32(&src4[i*src_stride4]);\
 906         l0=  (a&0x03030303UL)\
 907            + (b&0x03030303UL)\
 908            + 0x01010101UL;\
 909         h0= ((a&0xFCFCFCFCUL)>>2)\
 910           + ((b&0xFCFCFCFCUL)>>2);\
 911         l1=  (c&0x03030303UL)\
 912            + (d&0x03030303UL);\
 913         h1= ((c&0xFCFCFCFCUL)>>2)\
 914           + ((d&0xFCFCFCFCUL)>>2);\
 915         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 916         a= LD32(&src1[i*src_stride1+4]);\
 917         b= LD32(&src2[i*src_stride2+4]);\
 918         c= LD32(&src3[i*src_stride3+4]);\
 919         d= LD32(&src4[i*src_stride4+4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x01010101UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930     }\
 931 }\
 932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 933                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 934     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 935     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 936 }\
 937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 938                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 939     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 940     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 941 }\
 942 \
 943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 944 {\
 945         int i, a0, b0, a1, b1;\
 946         a0= pixels[0];\
 947         b0= pixels[1] + 2;\
 948         a0 += b0;\
 949         b0 += pixels[2];\
 950 \
 951         pixels+=line_size;\
 952         for(i=0; i<h; i+=2){\
 953             a1= pixels[0];\
 954             b1= pixels[1];\
 955             a1 += b1;\
 956             b1 += pixels[2];\
 957 \
 958             block[0]= (a1+a0)>>2; /* FIXME non put */\
 959             block[1]= (b1+b0)>>2;\
 960 \
 961             pixels+=line_size;\
 962             block +=line_size;\
 963 \
 964             a0= pixels[0];\
 965             b0= pixels[1] + 2;\
 966             a0 += b0;\
 967             b0 += pixels[2];\
 968 \
 969             block[0]= (a1+a0)>>2;\
 970             block[1]= (b1+b0)>>2;\
 971             pixels+=line_size;\
 972             block +=line_size;\
 973         }\
 974 }\
 975 \
 976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 977 {\
 978         int i;\
 979         const uint32_t a= LD32(pixels  );\
 980         const uint32_t b= LD32(pixels+1);\
 981         uint32_t l0=  (a&0x03030303UL)\
 982                     + (b&0x03030303UL)\
 983                     + 0x02020202UL;\
 984         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 985                    + ((b&0xFCFCFCFCUL)>>2);\
 986         uint32_t l1,h1;\
 987 \
 988         pixels+=line_size;\
 989         for(i=0; i<h; i+=2){\
 990             uint32_t a= LD32(pixels  );\
 991             uint32_t b= LD32(pixels+1);\
 992             l1=  (a&0x03030303UL)\
 993                + (b&0x03030303UL);\
 994             h1= ((a&0xFCFCFCFCUL)>>2)\
 995               + ((b&0xFCFCFCFCUL)>>2);\
 996             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 997             pixels+=line_size;\
 998             block +=line_size;\
 999             a= LD32(pixels  );\
1000             b= LD32(pixels+1);\
1001             l0=  (a&0x03030303UL)\
1002                + (b&0x03030303UL)\
1003                + 0x02020202UL;\
1004             h0= ((a&0xFCFCFCFCUL)>>2)\
1005               + ((b&0xFCFCFCFCUL)>>2);\
1006             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007             pixels+=line_size;\
1008             block +=line_size;\
1009         }\
1010 }\
1011 \
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1013 {\
1014     int j;\
1015     for(j=0; j<2; j++){\
1016         int i;\
1017         const uint32_t a= LD32(pixels  );\
1018         const uint32_t b= LD32(pixels+1);\
1019         uint32_t l0=  (a&0x03030303UL)\
1020                     + (b&0x03030303UL)\
1021                     + 0x02020202UL;\
1022         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023                    + ((b&0xFCFCFCFCUL)>>2);\
1024         uint32_t l1,h1;\
1025 \
1026         pixels+=line_size;\
1027         for(i=0; i<h; i+=2){\
1028             uint32_t a= LD32(pixels  );\
1029             uint32_t b= LD32(pixels+1);\
1030             l1=  (a&0x03030303UL)\
1031                + (b&0x03030303UL);\
1032             h1= ((a&0xFCFCFCFCUL)>>2)\
1033               + ((b&0xFCFCFCFCUL)>>2);\
1034             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035             pixels+=line_size;\
1036             block +=line_size;\
1037             a= LD32(pixels  );\
1038             b= LD32(pixels+1);\
1039             l0=  (a&0x03030303UL)\
1040                + (b&0x03030303UL)\
1041                + 0x02020202UL;\
1042             h0= ((a&0xFCFCFCFCUL)>>2)\
1043               + ((b&0xFCFCFCFCUL)>>2);\
1044             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045             pixels+=line_size;\
1046             block +=line_size;\
1047         }\
1048         pixels+=4-line_size*(h+1);\
1049         block +=4-line_size*h;\
1050     }\
1051 }\
1052 \
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1054 {\
1055     int j;\
1056     for(j=0; j<2; j++){\
1057         int i;\
1058         const uint32_t a= LD32(pixels  );\
1059         const uint32_t b= LD32(pixels+1);\
1060         uint32_t l0=  (a&0x03030303UL)\
1061                     + (b&0x03030303UL)\
1062                     + 0x01010101UL;\
1063         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064                    + ((b&0xFCFCFCFCUL)>>2);\
1065         uint32_t l1,h1;\
1066 \
1067         pixels+=line_size;\
1068         for(i=0; i<h; i+=2){\
1069             uint32_t a= LD32(pixels  );\
1070             uint32_t b= LD32(pixels+1);\
1071             l1=  (a&0x03030303UL)\
1072                + (b&0x03030303UL);\
1073             h1= ((a&0xFCFCFCFCUL)>>2)\
1074               + ((b&0xFCFCFCFCUL)>>2);\
1075             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076             pixels+=line_size;\
1077             block +=line_size;\
1078             a= LD32(pixels  );\
1079             b= LD32(pixels+1);\
1080             l0=  (a&0x03030303UL)\
1081                + (b&0x03030303UL)\
1082                + 0x01010101UL;\
1083             h0= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086             pixels+=line_size;\
1087             block +=line_size;\
1088         }\
1089         pixels+=4-line_size*(h+1);\
1090         block +=4-line_size*h;\
1091     }\
1092 }\
1093 \
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1102
/* avg: merge the new value with what is already stored in the destination via
 * rnd_avg32() (defined outside this section; presumably a rounding average --
 * confirm there). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* closes a conditional that was opened earlier in the file */
/* put: plain store of the new value into the destination. */
#define op_put(a, b) a = b

/* Instantiate the PIXOP2 function family once per store operation, then drop
 * the helper macros; they are redefined with different bodies further down. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1111
/**
 * Rounded integer averages of two and of four values.
 * Every argument is parenthesized so that operands containing operators of
 * lower precedence than '+' (e.g. '|', '&', '?:') are averaged as a whole.
 * Each argument is still evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1114
/**
 * Single-stride entry point for put_no_rnd_pixels16_l2(): the same stride is
 * passed for the destination and for both source operands.
 * @param dst    destination block
 * @param a      first source operand
 * @param b      second source operand
 * @param stride line size used for dst, a and b
 * @param h      number of rows to process
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1118
/**
 * Single-stride entry point for put_no_rnd_pixels8_l2(): the same stride is
 * passed for the destination and for both source operands.
 * @param dst    destination block
 * @param a      first source operand
 * @param b      second source operand
 * @param stride line size used for dst, a and b
 * @param h      number of rows to process
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1122
/**
 * Bilinear interpolation of an 8-pixel-wide block with 1/16 sample weights.
 * Every output pixel blends the 2x2 source neighbourhood starting at the same
 * position; the result is offset by rounder and shifted right by 8.
 * @param dst     destination, 8 pixels written per row
 * @param src     source; 9 pixels are read from each of rows i and i+1
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Weights of the top-left, top-right, bottom-left and bottom-right
     * neighbours; together they always sum to 256, hence the >>8 below. */
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = (     x16) * (16 - y16);
    const int w_bl = (16 - x16) * (     y16);
    const int w_br = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1] + rounder) >> 8;
        }
    }
}
1145
/**
 * Affine motion compensation of an 8-pixel-wide block, one pixel at a time.
 * The source position starts at (ox, oy), advances by (dxx, dyx) per output
 * column and by (dxy, dyy) per output row. After dropping the low 16 bits the
 * position still carries `shift` fractional bits, which become the bilinear
 * weights. Positions outside [0, width-1) x [0, height-1) are pulled back with
 * clip() (defined elsewhere; presumably a clamp -- confirm) and interpolated
 * only along the axis that is still inside, or copied when both are outside.
 * FIXME: unoptimized per-pixel loop.
 * @param r rounding value added before the final shift by 2*shift
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one      = 1 << shift;  /* weight corresponding to 1.0 */
    const int sub_mask = one - 1;     /* selects the fractional bits */
    const int last_x   = width  - 1;
    const int last_y   = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + row * stride;
        int pos_x = ox;
        int pos_y = oy;
        int col;

        for (col = 0; col < 8; col++, pos_x += dxx, pos_y += dyx) {
            const int fine_x = pos_x >> 16;
            const int fine_y = pos_y >> 16;
            const int wx = fine_x & sub_mask;
            const int wy = fine_y & sub_mask;
            const int px = fine_x >> shift;
            const int py = fine_y >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside = (unsigned)px < last_x;
            const int y_inside = (unsigned)py < last_y;
            int at;

            if (x_inside && y_inside) {
                /* full bilinear blend of the 2x2 neighbourhood */
                at = px + py*stride;
                out[col] = (  (  src[at         ]*(one-wx)
                               + src[at       +1]*     wx )*(one-wy)
                            + (  src[at+stride  ]*(one-wx)
                               + src[at+stride+1]*     wx )*     wy
                            + r) >> (shift*2);
            } else if (x_inside) {
                /* row out of range: horizontal blend on the clipped row */
                at = px + clip(py, 0, last_y)*stride;
                out[col] = ( (  src[at  ]*(one-wx)
                              + src[at+1]*     wx )*one
                            + r) >> (shift*2);
            } else if (y_inside) {
                /* column out of range: vertical blend on the clipped column */
                at = clip(px, 0, last_x) + py*stride;
                out[col] = ( (  src[at       ]*(one-wy)
                              + src[at+stride]*     wy )*one
                            + r) >> (shift*2);
            } else {
                /* both out of range: copy the clipped pixel */
                at = clip(px, 0, last_x) + clip(py, 0, last_y)*stride;
                out[col] = src[at];
            }
        }
    }
}
1203
/**
 * Full-pel case of the third-pel functions: hands the block to the
 * put_pixelsN_c routine matching the requested width.
 * Widths other than 2, 4, 8 and 16 are ignored (nothing is written).
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1212
/**
 * Third-pel horizontal interpolation: every output pixel is two parts of the
 * current source pixel and one part of its right neighbour, divided by 3.
 * Reads width+1 pixels per source row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11; /* 683/2048 approximates 1/3 */
        }
    }
}
1223
/**
 * Third-pel horizontal interpolation: every output pixel is one part of the
 * current source pixel and two parts of its right neighbour, divided by 3.
 * Reads width+1 pixels per source row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11; /* 683/2048 approximates 1/3 */
        }
    }
}
1234
/**
 * Third-pel vertical interpolation: every output pixel is two parts of the
 * current source pixel and one part of the pixel one row below, divided by 3.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11; /* 683/2048 approximates 1/3 */
        }
    }
}
1245
/**
 * Third-pel diagonal interpolation over a 2x2 neighbourhood with weights
 * 4 (current), 3 (right), 3 (below) and 2 (below-right), divided by 12.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15; /* 2731/32768 approximates 1/12 */
        }
    }
}
1256
/**
 * Third-pel diagonal interpolation over a 2x2 neighbourhood with weights
 * 3 (current), 2 (right), 4 (below) and 3 (below-right), divided by 12.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15; /* 2731/32768 approximates 1/12 */
        }
    }
}
1267
/**
 * Third-pel vertical interpolation: every output pixel is one part of the
 * current source pixel and two parts of the pixel one row below, divided by 3.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11; /* 683/2048 approximates 1/3 */
        }
    }
}
1278
/**
 * Third-pel diagonal interpolation over a 2x2 neighbourhood with weights
 * 3 (current), 4 (right), 2 (below) and 3 (below-right), divided by 12.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15; /* 2731/32768 approximates 1/12 */
        }
    }
}
1289
/**
 * Third-pel diagonal interpolation over a 2x2 neighbourhood with weights
 * 2 (current), 3 (right), 3 (below) and 4 (below-right), divided by 12.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15; /* 2731/32768 approximates 1/12 */
        }
    }
}
1300
/**
 * Full-pel case of the averaging third-pel functions: hands the block to the
 * avg_pixelsN_c routine matching the requested width.
 * Widths other than 2, 4, 8 and 16 are ignored (dst is left untouched).
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1309
/**
 * Averaging third-pel horizontal interpolation: computes (2*current + right)/3
 * and stores the rounded mean of that value and the pixel already in dst.
 * Reads width+1 pixels per source row.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 683/2048 approximates 1/3 */
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1320
/**
 * Averaging third-pel horizontal interpolation: computes (current + 2*right)/3
 * and stores the rounded mean of that value and the pixel already in dst.
 * Reads width+1 pixels per source row.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 683/2048 approximates 1/3 */
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1331
/**
 * Averaging third-pel vertical interpolation: computes (2*current + below)/3
 * and stores the rounded mean of that value and the pixel already in dst.
 * Reads height+1 source rows.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 683/2048 approximates 1/3 */
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1342
/**
 * Averaging third-pel diagonal interpolation: blends a 2x2 neighbourhood with
 * weights 4 (current), 3 (right), 3 (below), 2 (below-right), divided by 12,
 * then stores the rounded mean of that value and the pixel already in dst.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 2731/32768 approximates 1/12 */
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1353
/**
 * Averaging third-pel diagonal interpolation: blends a 2x2 neighbourhood with
 * weights 3 (current), 2 (right), 4 (below), 3 (below-right), divided by 12,
 * then stores the rounded mean of that value and the pixel already in dst.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 2731/32768 approximates 1/12 */
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1364
/**
 * Averaging third-pel vertical interpolation: computes (current + 2*below)/3
 * and stores the rounded mean of that value and the pixel already in dst.
 * Reads height+1 source rows.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 683/2048 approximates 1/3 */
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1375
/**
 * Averaging third-pel diagonal interpolation: blends a 2x2 neighbourhood with
 * weights 3 (current), 4 (right), 2 (below), 3 (below-right), divided by 12,
 * then stores the rounded mean of that value and the pixel already in dst.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 2731/32768 approximates 1/12 */
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1386
/**
 * Averaging third-pel diagonal interpolation: blends a 2x2 neighbourhood with
 * weights 2 (current), 3 (right), 3 (below), 4 (below-right), divided by 12,
 * then stores the rounded mean of that value and the pixel already in dst.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            /* 2731/32768 approximates 1/12 */
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
/* Disabled code: TPEL_WIDTH(width) would generate fixed-width wrappers
 * (put_tpel_pixels<width>_mcXY_c) that forward to the generic
 * put_tpel_pixels_mcXY_c helpers with the width filled in.
 * NOTE(review): as written, each generated body begins with "void", which
 * makes it a declaration rather than a call to the helper; this has to be
 * fixed before the block can be enabled. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1418
/**
 * Generates the 2-, 4- and 8-pixel-wide H.264 chroma motion compensation
 * functions OPNAME##h264_chroma_mcN_c(dst, src, stride, h, x, y).
 * Every output sample is a bilinear blend of the 2x2 source neighbourhood
 * with 1/8 sample weights derived from x and y (both must be in 0..7; the four
 * weights sum to 64). The unscaled sum is handed to OP, which decides how it
 * is rounded and stored into dst. N+1 pixels are read from each of h+1 rows.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
    }\
}

/* avg: scale the sum back to 8 bits with rounding, then store the rounded
 * mean of that value and the pixel already in the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
/* put: scale the sum back to 8 bits with rounding and store it. */
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1489
/**
 * Copies h rows of 2 bytes from src to dst, one LD16/ST16 pair per row
 * (both helpers are defined elsewhere; presumably 16-bit load/store).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        ST16(dst, LD16(src));
        dst += dstStride;
        src += srcStride;
    }
}
1500
/**
 * Copies h rows of 4 bytes from src to dst, one LD32/ST32 pair per row
 * (both helpers are defined elsewhere; presumably 32-bit load/store).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1511
/**
 * Copies h rows of 8 bytes from src to dst as two 4-byte LD32/ST32 transfers
 * per row (helpers defined elsewhere; presumably 32-bit load/store).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1523
/**
 * Copies h rows of 16 bytes from src to dst as four 4-byte LD32/ST32
 * transfers per row (helpers defined elsewhere; presumably 32-bit load/store).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1537
/**
 * Copies h rows of 17 bytes from src to dst: four 4-byte LD32/ST32 transfers
 * (helpers defined elsewhere; presumably 32-bit load/store) followed by a
 * single byte copy for the last column.
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16]; /* odd trailing column */
        dst += dstStride;
        src += srcStride;
    }
}
1552
/**
 * Copies h rows of 9 bytes from src to dst: two 4-byte LD32/ST32 transfers
 * (helpers defined elsewhere; presumably 32-bit load/store) followed by a
 * single byte copy for the last column.
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8]; /* odd trailing column */
        dst += dstStride;
        src += srcStride;
    }
}
1565
1566
/**
 * QPEL_MC - generate one family of MPEG-4 quarter-pel motion compensation
 * functions for 8x8 and 16x16 blocks.
 *
 * Macro parameters:
 *   r      - not referenced anywhere in the macro body.
 *   OPNAME - prefix pasted onto every generated function name and onto the
 *            pixels8/pixels16 helpers (_c, _l2, _l4) that write to dst.
 *   RND    - infix pasted after "put" to select the helper variant used for
 *            intermediate buffers (put RND mpeg4_qpel*_lowpass,
 *            put RND pixels*_l2).
 *   OP     - statement macro OP(dst_lvalue, sum) that stores one filtered
 *            sample. Each lowpass function declares the local crop table
 *            pointer cm without using it directly; presumably OP is expected
 *            to reference it (the op_* macros later in this file do).
 *
 * Lowpass filters (OPNAME mpeg4_qpel{8,16}_{h,v}_lowpass):
 *   Symmetric 8-tap kernel with weights (-1, 3, -6, 20, 20, -6, 3, -1),
 *   which sum to 32, evaluated between sample i and sample i+1. At both
 *   block edges the out-of-range taps are mirrored back inside, so a row
 *   (or column) only reads samples 0..8 for the 8-wide filters and 0..16
 *   for the 16-wide ones. The h variants process h rows; the v variants
 *   always process a full 8x8 or 16x16 block and take no height argument.
 *
 * Entry points (OPNAME qpel{8,16}_mcXY_c):
 *   Going by the calls below, X (0..3) is the horizontal and Y (0..3) the
 *   vertical quarter-sample position:
 *     X0/0Y with one odd digit: source combined with the filtered plane via
 *       pixels*_l2; offset 3 pairs with the next pixel (src+1) or the next
 *       row (full+16 / full+24) instead of the current one.
 *     mc20 / mc02: filtered plane written directly to dst.
 *     mc11/mc31/mc13/mc33: halfH is the horizontal filter over 9 (17) rows,
 *       merged with full (or full+1) by put RND pixels*_l2, filtered
 *       vertically into halfHV, and finally halfH (or halfH shifted by one
 *       row) is combined with halfHV into dst.
 *     mc21/mc23: as above but halfH is not merged with the source first.
 *     mc12/mc32: merged halfH is vertically filtered straight into dst.
 *     mc22: horizontal filter followed by vertical filter into dst.
 *   Vertical paths first copy the source into a local buffer (full) with a
 *   fixed stride of 16 (24 for 16x16) using copy_block9 / copy_block17,
 *   which are defined elsewhere - presumably a 9x9 / 17x17 copy; confirm
 *   against those helpers.
 *   halfH holds 8*9 = 72 (16*17 = 272) bytes because the vertical filter
 *   needs one row more than the block height.
 *
 * ff_ OPNAME qpel{8,16}_mcXY_old_c:
 *   Non-static alternatives for the mc11/31/13/33/12/32 positions that
 *   build separate halfH, halfV and halfHV planes and combine them with
 *   pixels*_l4 (or pixels*_l2 for mc12/mc32).
 */
1567 #define QPEL_MC(r, OPNAME, RND, OP) \
1568 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1569     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1570     int i;\
1571     for(i=0; i<h; i++)\
1572     {\
1573         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1574         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1575         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1576         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1577         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1578         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1579         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1580         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1581         dst+=dstStride;\
1582         src+=srcStride;\
1583     }\
1584 }\
1585 /* Vertical form of the same filter: 8 columns, each reading rows 0..8. */\
1586 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1587     const int w=8;\
1588     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1589     int i;\
1590     for(i=0; i<w; i++)\
1591     {\
1592         const int src0= src[0*srcStride];\
1593         const int src1= src[1*srcStride];\
1594         const int src2= src[2*srcStride];\
1595         const int src3= src[3*srcStride];\
1596         const int src4= src[4*srcStride];\
1597         const int src5= src[5*srcStride];\
1598         const int src6= src[6*srcStride];\
1599         const int src7= src[7*srcStride];\
1600         const int src8= src[8*srcStride];\
1601         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1602         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1603         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1604         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1605         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1606         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1607         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1608         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1609         dst++;\
1610         src++;\
1611     }\
1612 }\
1613 /* 16-pixel-wide horizontal filter; each row reads src[0..16]. */\
1614 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1615     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1616     int i;\
1617     \
1618     for(i=0; i<h; i++)\
1619     {\
1620         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1621         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1622         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1623         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1624         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1625         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1626         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1627         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1628         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1629         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1630         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1631         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1632         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1633         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1634         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1635         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1636         dst+=dstStride;\
1637         src+=srcStride;\
1638     }\
1639 }\
1640 /* 16x16 vertical filter; each column reads rows 0..16. */\
1641 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1642     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1643     int i;\
1644     const int w=16;\
1645     for(i=0; i<w; i++)\
1646     {\
1647         const int src0= src[0*srcStride];\
1648         const int src1= src[1*srcStride];\
1649         const int src2= src[2*srcStride];\
1650         const int src3= src[3*srcStride];\
1651         const int src4= src[4*srcStride];\
1652         const int src5= src[5*srcStride];\
1653         const int src6= src[6*srcStride];\
1654         const int src7= src[7*srcStride];\
1655         const int src8= src[8*srcStride];\
1656         const int src9= src[9*srcStride];\
1657         const int src10= src[10*srcStride];\
1658         const int src11= src[11*srcStride];\
1659         const int src12= src[12*srcStride];\
1660         const int src13= src[13*srcStride];\
1661         const int src14= src[14*srcStride];\
1662         const int src15= src[15*srcStride];\
1663         const int src16= src[16*srcStride];\
1664         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1665         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1666         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1667         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1668         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1669         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1670         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1671         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1672         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1673         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1674         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1675         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1676         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1677         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1678         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1679         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1680         dst++;\
1681         src++;\
1682     }\
1683 }\
1684 /* 8x8 entry points. mc00: no sub-pel offset, plain OPNAME pixels8_c call. */\
1685 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1686     OPNAME ## pixels8_c(dst, src, stride, 8);\
1687 }\
1688 /* mc10: src combined (pixels8_l2) with the horizontally filtered plane. */\
1689 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1690     uint8_t half[64];\
1691     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1692     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1693 }\
1694 /* mc20: horizontally filtered plane written straight to dst. */\
1695 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1696     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1697 }\
1698 /* mc30: like mc10 but paired with src+1 (the pixel to the right). */\
1699 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1700     uint8_t half[64];\
1701     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1702     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1703 }\
1704 /* mc01: vertical analogue of mc10, working from the stride-16 copy in full. */\
1705 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1706     uint8_t full[16*9];\
1707     uint8_t half[64];\
1708     copy_block9(full, src, 16, stride, 9);\
1709     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1710     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1711 }\
1712 /* mc02: vertically filtered plane written straight to dst. */\
1713 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1714     uint8_t full[16*9];\
1715     copy_block9(full, src, 16, stride, 9);\
1716     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1717 }\
1718 /* mc03: like mc01 but paired with full+16 (the row below). */\
1719 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1720     uint8_t full[16*9];\
1721     uint8_t half[64];\
1722     copy_block9(full, src, 16, stride, 9);\
1723     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1724     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1725 }\
1726 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727     uint8_t full[16*9];\
1728     uint8_t halfH[72];\
1729     uint8_t halfV[64];\
1730     uint8_t halfHV[64];\
1731     copy_block9(full, src, 16, stride, 9);\
1732     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736 }\
1737 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1738     uint8_t full[16*9];\
1739     uint8_t halfH[72];\
1740     uint8_t halfHV[64];\
1741     copy_block9(full, src, 16, stride, 9);\
1742     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1746 }\
1747 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748     uint8_t full[16*9];\
1749     uint8_t halfH[72];\
1750     uint8_t halfV[64];\
1751     uint8_t halfHV[64];\
1752     copy_block9(full, src, 16, stride, 9);\
1753     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1754     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757 }\
1758 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1759     uint8_t full[16*9];\
1760     uint8_t halfH[72];\
1761     uint8_t halfHV[64];\
1762     copy_block9(full, src, 16, stride, 9);\
1763     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1767 }\
1768 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769     uint8_t full[16*9];\
1770     uint8_t halfH[72];\
1771     uint8_t halfV[64];\
1772     uint8_t halfHV[64];\
1773     copy_block9(full, src, 16, stride, 9);\
1774     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1775     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1778 }\
1779 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[16*9];\
1781     uint8_t halfH[72];\
1782     uint8_t halfHV[64];\
1783     copy_block9(full, src, 16, stride, 9);\
1784     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1785     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1786     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1788 }\
1789 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1790     uint8_t full[16*9];\
1791     uint8_t halfH[72];\
1792     uint8_t halfV[64];\
1793     uint8_t halfHV[64];\
1794     copy_block9(full, src, 16, stride, 9);\
1795     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1796     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1797     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1798     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1799 }\
1800 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1801     uint8_t full[16*9];\
1802     uint8_t halfH[72];\
1803     uint8_t halfHV[64];\
1804     copy_block9(full, src, 16, stride, 9);\
1805     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1806     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1807     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1808     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1809 }\
1810 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1811     uint8_t halfH[72];\
1812     uint8_t halfHV[64];\
1813     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1816 }\
1817 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1818     uint8_t halfH[72];\
1819     uint8_t halfHV[64];\
1820     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1821     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1823 }\
1824 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1825     uint8_t full[16*9];\
1826     uint8_t halfH[72];\
1827     uint8_t halfV[64];\
1828     uint8_t halfHV[64];\
1829     copy_block9(full, src, 16, stride, 9);\
1830     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1831     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1832     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1833     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1834 }\
1835 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1836     uint8_t full[16*9];\
1837     uint8_t halfH[72];\
1838     copy_block9(full, src, 16, stride, 9);\
1839     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1840     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1841     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1842 }\
1843 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844     uint8_t full[16*9];\
1845     uint8_t halfH[72];\
1846     uint8_t halfV[64];\
1847     uint8_t halfHV[64];\
1848     copy_block9(full, src, 16, stride, 9);\
1849     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1851     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1853 }\
1854 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1855     uint8_t full[16*9];\
1856     uint8_t halfH[72];\
1857     copy_block9(full, src, 16, stride, 9);\
1858     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1859     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1860     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1861 }\
1862 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1863     uint8_t halfH[72];\
1864     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1865     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1866 }\
1867 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1868     OPNAME ## pixels16_c(dst, src, stride, 16);\
1869 }\
1870 /* 16x16 versions of the entry points follow; full uses stride 24 and 17 rows. */\
1871 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1872     uint8_t half[256];\
1873     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1874     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1875 }\
1876 \
1877 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1878     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1879 }\
1880 \
1881 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1882     uint8_t half[256];\
1883     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1884     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1885 }\
1886 \
1887 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1888     uint8_t full[24*17];\
1889     uint8_t half[256];\
1890     copy_block17(full, src, 24, stride, 17);\
1891     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1892     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1893 }\
1894 \
1895 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1896     uint8_t full[24*17];\
1897     copy_block17(full, src, 24, stride, 17);\
1898     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1899 }\
1900 \
1901 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1902     uint8_t full[24*17];\
1903     uint8_t half[256];\
1904     copy_block17(full, src, 24, stride, 17);\
1905     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1906     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1907 }\
1908 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909     uint8_t full[24*17];\
1910     uint8_t halfH[272];\
1911     uint8_t halfV[256];\
1912     uint8_t halfHV[256];\
1913     copy_block17(full, src, 24, stride, 17);\
1914     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918 }\
1919 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1920     uint8_t full[24*17];\
1921     uint8_t halfH[272];\
1922     uint8_t halfHV[256];\
1923     copy_block17(full, src, 24, stride, 17);\
1924     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1928 }\
1929 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930     uint8_t full[24*17];\
1931     uint8_t halfH[272];\
1932     uint8_t halfV[256];\
1933     uint8_t halfHV[256];\
1934     copy_block17(full, src, 24, stride, 17);\
1935     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1936     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939 }\
1940 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1941     uint8_t full[24*17];\
1942     uint8_t halfH[272];\
1943     uint8_t halfHV[256];\
1944     copy_block17(full, src, 24, stride, 17);\
1945     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1949 }\
1950 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1951     uint8_t full[24*17];\
1952     uint8_t halfH[272];\
1953     uint8_t halfV[256];\
1954     uint8_t halfHV[256];\
1955     copy_block17(full, src, 24, stride, 17);\
1956     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1957     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1958     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1960 }\
1961 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1962     uint8_t full[24*17];\
1963     uint8_t halfH[272];\
1964     uint8_t halfHV[256];\
1965     copy_block17(full, src, 24, stride, 17);\
1966     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1967     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1968     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1970 }\
1971 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1972     uint8_t full[24*17];\
1973     uint8_t halfH[272];\
1974     uint8_t halfV[256];\
1975     uint8_t halfHV[256];\
1976     copy_block17(full, src, 24, stride, 17);\
1977     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1978     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1979     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1980     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1981 }\
1982 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1983     uint8_t full[24*17];\
1984     uint8_t halfH[272];\
1985     uint8_t halfHV[256];\
1986     copy_block17(full, src, 24, stride, 17);\
1987     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1988     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1989     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1990     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1991 }\
1992 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1993     uint8_t halfH[272];\
1994     uint8_t halfHV[256];\
1995     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1997     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1998 }\
1999 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2000     uint8_t halfH[272];\
2001     uint8_t halfHV[256];\
2002     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2003     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2005 }\
2006 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007     uint8_t full[24*17];\
2008     uint8_t halfH[272];\
2009     uint8_t halfV[256];\
2010     uint8_t halfHV[256];\
2011     copy_block17(full, src, 24, stride, 17);\
2012     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2013     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2014     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2016 }\
2017 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t full[24*17];\
2019     uint8_t halfH[272];\
2020     copy_block17(full, src, 24, stride, 17);\
2021     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2022     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2023     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2024 }\
2025 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t full[24*17];\
2027     uint8_t halfH[272];\
2028     uint8_t halfV[256];\
2029     uint8_t halfHV[256];\
2030     copy_block17(full, src, 24, stride, 17);\
2031     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2032     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2033     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2034     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2035 }\
2036 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2037     uint8_t full[24*17];\
2038     uint8_t halfH[272];\
2039     copy_block17(full, src, 24, stride, 17);\
2040     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2041     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2042     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2043 }\
2044 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2045     uint8_t halfH[272];\
2046     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2047     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2048 }
2049
/*
 * Store operators handed to QPEL_MC as its OP argument.
 *   a - destination lvalue (one output pixel).
 *   b - raw 8-tap filter sum; the qpel taps add up to 32, hence the >>5.
 * Each operator adds a bias before the shift (16 for the normal forms, 15
 * for the no_rnd forms) and looks the result up in cm, which is not a macro
 * parameter and must therefore be in scope wherever the operator is expanded
 * (presumably cm clamps to the 8-bit pixel range - confirm against cropTbl).
 * The put forms overwrite a; the avg forms combine the looked-up value with
 * the value already in a: (a + x + 1) >> 1 for op_avg and (a + x) >> 1 for
 * op_avg_no_rnd.
 */
2050 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2051 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2052 #define op_put(a, b) a = cm[((b) + 16)>>5]
2053 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2054
/*
 * Instantiate the qpel function families. Arguments are
 * (r, OPNAME, RND, OP):
 *   put_        - stores with op_put, intermediates via the put_ helpers.
 *   put_no_rnd_ - stores with op_put_no_rnd, intermediates via the
 *                 put_no_rnd_ helpers.
 *   avg_        - stores with op_avg; RND is "_", so its intermediate
 *                 buffers are produced by the put_ helpers.
 * The avg_no_rnd instantiation is commented out, so no avg_no_rnd qpel
 * functions are generated here and op_avg_no_rnd is not used by any of the
 * active instantiations.
 * The op_* helpers are undefined afterwards, which frees the names for
 * reuse later in the file.
 */
2055 QPEL_MC(0, put_       , _       , op_put)
2056 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2057 QPEL_MC(0, avg_       , _       , op_avg)
2058 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2059 #undef op_avg
2060 #undef op_avg_no_rnd
2061 #undef op_put
2062 #undef op_put_no_rnd
2063
2064 #if 1
2065 #define H264_LOWPASS(OPNAME, OP, OP2) \
2066 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2067     const int h=2;\
2068     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2069     int i;\
2070     for(i=0; i<h; i++)\
2071     {\
2072         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2073         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2074         dst+=dstStride;\
2075         src+=srcStride;\
2076     }\
2077 }\
2078 \
2079 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2080     const int w=2;\
2081     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2082     int i;\
2083     for(i=0; i<w; i++)\
2084     {\
2085         const int srcB= src[-2*srcStride];\
2086         const int srcA= src[-1*srcStride];\
2087         const int src0= src[0 *srcStride];\
2088         const int src1= src[1 *srcStride];\
2089         const int src2= src[2 *srcStride];\
2090         const int src3= src[3 *srcStride];\
2091         const int src4= src[4 *srcStride];\
2092         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2093         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2094         dst++;\
2095         src++;\
2096     }\
2097 }\
2098 \
2099 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2100     const int h=2;\
2101     const int w=2;\
2102     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2103     int i;\
2104     src -= 2*srcStride;\
2105     for(i=0; i<h+5; i++)\
2106     {\
2107         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2108         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2109         tmp+=tmpStride;\
2110         src+=srcStride;\
2111     }\
2112     tmp -= tmpStride*(h+5-2);\
2113     for(i=0; i<w; i++)\
2114     {\
2115         const int tmpB= tmp[-2*tmpStride];\
2116         const int tmpA= tmp[-1*tmpStride];\
2117         const int tmp0= tmp[0 *tmpStride];\
2118         const int tmp1= tmp[1 *tmpStride];\
2119         const int tmp2= tmp[2 *tmpStride];\
2120         const int tmp3= tmp[3 *tmpStride];\
2121         const int tmp4= tmp[4 *tmpStride];\
2122         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2123         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2124         dst++;\
2125         tmp++;\
2126     }\
2127 }\
2128 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2129     const int h=4;\
2130     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2131     int i;\
2132     for(i=0; i<h; i++)\
2133     {\
2134         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2135         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2136         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2137         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2138         dst+=dstStride;\
2139         src+=srcStride;\
2140     }\
2141 }\
2142 \
2143 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2144     const int w=4;\
2145     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2146     int i;\
2147     for(i=0; i<w; i++)\
2148     {\
2149         const int srcB= src[-2*srcStride];\
2150         const int srcA= src[-1*srcStride];\
2151         const int src0= src[0 *srcStride];\
2152         const int src1= src[1 *srcStride];\
2153         const int src2= src[2 *srcStride];\
2154         const int src3= src[3 *srcStride];\
2155         const int src4= src[4 *srcStride];\
2156         const int src5= src[5 *srcStride];\
2157         const int src6= src[6 *srcStride];\
2158         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2159         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2160         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2161         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2162         dst++;\
2163         src++;\
2164     }\
2165 }\
2166 \
2167 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2168     const int h=4;\
2169     const int w=4;\
2170     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2171     int i;\
2172     src -= 2*srcStride;\
2173     for(i=0; i<h+5; i++)\
2174     {\
2175         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2176         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2177         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2178         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2179         tmp+=tmpStride;\
2180         src+=srcStride;\
2181     }\
2182     tmp -= tmpStride*(h+5-2);\
2183     for(i=0; i<w; i++)\
2184     {\
2185         const int tmpB= tmp[-2*tmpStride];\
2186         const int tmpA= tmp[-1*tmpStride];\
2187         const int tmp0= tmp[0 *tmpStride];\
2188         const int tmp1= tmp[1 *tmpStride];\
2189         const int tmp2= tmp[2 *tmpStride];\
2190         const int tmp3= tmp[3 *tmpStride];\
2191         const int tmp4= tmp[4 *tmpStride];\
2192         const int tmp5= tmp[5 *tmpStride];\
2193         const int tmp6= tmp[6 *tmpStride];\
2194         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2195         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2196         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2197         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2198         dst++;\
2199         tmp++;\
2200     }\
2201 }\
2202 \
2203 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204     const int h=8;\
2205     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2206     int i;\
2207     for(i=0; i<h; i++)\
2208     {\
2209         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2210         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2211         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2212         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2213         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2214         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2215         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2216         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2217         dst+=dstStride;\
2218         src+=srcStride;\
2219     }\
2220 }\
2221 \
2222 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2223     const int w=8;\
2224     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2225     int i;\
2226     for(i=0; i<w; i++)\
2227     {\
2228         const int srcB= src[-2*srcStride];\
2229         const int srcA= src[-1*srcStride];\
2230         const int src0= src[0 *srcStride];\
2231         const int src1= src[1 *srcStride];\
2232         const int src2= src[2 *srcStride];\
2233         const int src3= src[3 *srcStride];\
2234         const int src4= src[4 *srcStride];\
2235         const int src5= src[5 *srcStride];\
2236         const int src6= src[6 *srcStride];\
2237         const int src7= src[7 *srcStride];\
2238         const int src8= src[8 *srcStride];\
2239         const int src9= src[9 *srcStride];\
2240         const int src10=src[10*srcStride];\
2241         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2242         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2243         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2244         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2245         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2246         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2247         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2248         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2249         dst++;\
2250         src++;\
2251     }\
2252 }\
2253 \
2254 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2255     const int h=8;\
2256     const int w=8;\
2257     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2258     int i;\
2259     src -= 2*srcStride;\
2260     for(i=0; i<h+5; i++)\
2261     {\
2262         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2263         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2264         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2265         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2266         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2267         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2268         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2269         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2270         tmp+=tmpStride;\
2271         src+=srcStride;\
2272     }\
2273     tmp -= tmpStride*(h+5-2);\
2274     for(i=0; i<w; i++)\
2275     {\
2276         const int tmpB= tmp[-2*tmpStride];\
2277         const int tmpA= tmp[-1*tmpStride];\
2278         const int tmp0= tmp[0 *tmpStride];\
2279         const int tmp1= tmp[1 *tmpStride];\
2280         const int tmp2= tmp[2 *tmpStride];\
2281         const int tmp3= tmp[3 *tmpStride];\
2282         const int tmp4= tmp[4 *tmpStride];\
2283         const int tmp5= tmp[5 *tmpStride];\
2284         const int tmp6= tmp[6 *tmpStride];\
2285         const int tmp7= tmp[7 *tmpStride];\
2286         const int tmp8= tmp[8 *tmpStride];\
2287         const int tmp9= tmp[9 *tmpStride];\
2288         const int tmp10=tmp[10*tmpStride];\
2289         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2290         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2291         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2292         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2293         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2294         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2295         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2296         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2297         dst++;\
2298         tmp++;\
2299     }\
2300 }\
2301 \
2302 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2303     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2304     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2305     src += 8*srcStride;\
2306     dst += 8*dstStride;\
2307     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2308     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2309 }\
2310 \
2311 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2312     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2313     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2314     src += 8*srcStride;\
2315     dst += 8*dstStride;\
2316     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2317     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2318 }\
2319 \
2320 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2321     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2322     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2323     src += 8*srcStride;\
2324     dst += 8*dstStride;\
2325     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2326     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2327 }\
2328
/**
 * H264_MC(OPNAME, SIZE) emits the 16 entry points
 * OPNAME h264_qpel SIZE _mcXY_c (X, Y in 0..3) for one block size.
 *
 * Building blocks used by the generated functions (all defined elsewhere):
 *  - *_h_lowpass / *_v_lowpass / *_hv_lowpass: the H.264 lowpass filters
 *  - copy_block SIZE: copies rows into a buffer with a stride of SIZE
 *  - OPNAME pixels SIZE _l2: combines two sources into dst (presumably a
 *    rounded average; confirm against its definition)
 *
 * Intermediate planes are always produced with the put_ variants; OPNAME is
 * applied only by the final store into dst.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* no filtering: hand SIZE rows straight to the pixels routine */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    /* horizontal lowpass, then combine with the unshifted source */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hplane, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* horizontal lowpass written directly to dst */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    /* horizontal lowpass, then combine with the source one pixel to the right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hplane, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* SIZE+5 rows starting two rows above src, repacked with stride SIZE */\
    copy_block ## SIZE (padded, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, body, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    copy_block ## SIZE (padded, src - 2*stride, SIZE,  stride, SIZE + 5);\
    /* vertical lowpass written directly to dst */\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, body, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    copy_block ## SIZE (padded, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    /* combine with the source one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, body+SIZE, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* horizontal plane at src, vertical plane at src */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    copy_block ## SIZE (padded, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* horizontal plane at src, vertical plane one column to the right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    copy_block ## SIZE (padded, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* horizontal plane one row down, vertical plane at src */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* horizontal plane one row down, vertical plane one column to the right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    /* two-dimensional lowpass written directly to dst; scratch holds the 16-bit intermediate */\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    int16_t scratch[SIZE*(SIZE+5)];\
    /* horizontal plane at src combined with the two-dimensional plane */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    int16_t scratch[SIZE*(SIZE+5)];\
    /* horizontal plane one row down combined with the two-dimensional plane */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* vertical plane at src combined with the two-dimensional plane */\
    copy_block ## SIZE (padded, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hvplane[SIZE*SIZE];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const body= padded + 2*SIZE;\
    /* vertical plane one column to the right combined with the two-dimensional plane */\
    copy_block ## SIZE (padded, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, body, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vplane, hvplane, stride, SIZE, SIZE, SIZE);\
}
2465
2466 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2467 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2468 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2469 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2470 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2471
2472 H264_LOWPASS(put_       , op_put, op2_put)
2473 H264_LOWPASS(avg_       , op_avg, op2_avg)
2474 H264_MC(put_, 2)
2475 H264_MC(put_, 4)
2476 H264_MC(put_, 8)
2477 H264_MC(put_, 16)
2478 H264_MC(avg_, 4)
2479 H264_MC(avg_, 8)
2480 H264_MC(avg_, 16)
2481
2482 #undef op_avg
2483 #undef op_put
2484 #undef op2_avg
2485 #undef op2_put
2486 #endif
2487
/*
 * H.264 explicit weighted prediction.
 *
 * op_scale1(x): in-place unidirectional weighting of block[x].
 * op_scale2(x): bidirectional blend of src[x] into dst[x].
 * Both rely on the locals of the function they are expanded in
 * (weight/weights/weightd, offset, log2_denom) and clamp via clip_uint8().
 *
 * H264_WEIGHT(W,H) generates weight_h264_pixelsWxH_c and
 * biweight_h264_pixelsWxH_c. W must be 2, 4, 8 or 16: the row body is
 * unrolled and leaves early through the "if(W==n) continue" checks, which
 * are compile-time constants.
 *
 * The offset is pre-scaled once per call. It may be negative, and
 * left-shifting a negative int is undefined behaviour in C, so the shift is
 * done on the unsigned representation and converted back.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* scale the offset to the weight precision without shifting a negative int */ \
    offset = (unsigned)offset << log2_denom; \
    /* rounding term for the final >> log2_denom (none needed when it is 0) */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* ((offset+1)|1) folds the rounding bit in; shift as unsigned to avoid UB on negatives */ \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2557
2558 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2559     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2560     int i;
2561
2562     for(i=0; i<h; i++){
2563         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2564         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2565         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2566         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2567         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2568         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2569         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2570         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2571         dst+=dstStride;
2572         src+=srcStride;
2573     }
2574 }
2575
2576 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2577     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2578     int i;
2579
2580     for(i=0; i<w; i++){
2581         const int src_1= src[ -srcStride];
2582         const int src0 = src[0          ];
2583         const int src1 = src[  srcStride];
2584         const int src2 = src[2*srcStride];
2585         const int src3 = src[3*srcStride];
2586         const int src4 = src[4*srcStride];
2587         const int src5 = src[5*srcStride];
2588         const int src6 = src[6*srcStride];
2589         const int src7 = src[7*srcStride];
2590         const int src8 = src[8*srcStride];
2591         const int src9 = src[9*srcStride];
2592         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2593         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2594         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2595         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2596         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2597         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2598         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2599         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2600         src++;
2601         dst++;
2602     }
2603 }
2604
/** mspel position (0,0): no filtering, forwards 8 rows to put_pixels8_c. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2608
/**
 * mspel position (1,0): horizontal lowpass into a packed 8x8 buffer, then
 * put_pixels8_l2 combines it with the unshifted source.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2614
/** mspel position (2,0): horizontal lowpass of 8 rows written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2618
/**
 * mspel position (3,0): horizontal lowpass into a packed 8x8 buffer, then
 * put_pixels8_l2 combines it with the source shifted one pixel to the right.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2624
/** mspel position (0,2): vertical lowpass of 8 columns written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2628
/**
 * mspel position (1,2): combines the vertically filtered source with the
 * horizontally-then-vertically filtered source.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads rows -1 .. 9; the vertical pass therefore starts at
 * the second row of that buffer.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t horiz[8*11];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, horiz+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): like position (1,2), except that the vertically
 * filtered plane is taken one pixel to the right (src+1).
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads rows -1 .. 9.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t horiz[8*11];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, horiz+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal lowpass over 11 rows (starting one row
 * above src) into a packed buffer, then vertical lowpass of that buffer
 * straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t horiz[8*11];

    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, horiz+8, stride, 8, 8);
}
2652
2653 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2654     int x;
2655     const int strength= ff_h263_loop_filter_strength[qscale];
2656
2657     for(x=0; x<8; x++){
2658         int d1, d2, ad1;
2659         int p0= src[x-2*stride];
2660         int p1= src[x-1*stride];
2661         int p2= src[x+0*stride];
2662         int p3= src[x+1*stride];
2663         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2664
2665         if     (d<-2*strength) d1= 0;
2666         else if(d<-  strength) d1=-2*strength - d;
2667         else if(d<   strength) d1= d;
2668         else if(d< 2*strength) d1= 2*strength - d;
2669         else                   d1= 0;
2670
2671         p1 += d1;
2672         p2 -= d1;
2673         if(p1&256) p1= ~(p1>>31);
2674         if(p2&256) p2= ~(p2>>31);
2675
2676         src[x-1*stride] = p1;
2677         src[x+0*stride] = p2;
2678
2679         ad1= ABS(d1)>>1;
2680
2681         d2= clip((p0-p3)/4, -ad1, ad1);
2682
2683         src[x-2*stride] = p0 - d2;
2684         src[x+  stride] = p3 + d2;
2685     }
2686 }
2687
2688 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2689     int y;
2690     const int strength= ff_h263_loop_filter_strength[qscale];
2691
2692     for(y=0; y<8; y++){
2693         int d1, d2, ad1;
2694         int p0= src[y*stride-2];
2695         int p1= src[y*stride-1];
2696         int p2= src[y*stride+0];
2697         int p3= src[y*stride+1];
2698         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2699
2700         if     (d<-2*strength) d1= 0;
2701         else if(d<-  strength) d1=-2*strength - d;
2702         else if(d<   strength) d1= d;
2703         else if(d< 2*strength) d1= 2*strength - d;
2704         else                   d1= 0;
2705
2706         p1 += d1;
2707         p2 -= d1;
2708         if(p1&256) p1= ~(p1>>31);
2709         if(p2&256) p2= ~(p2>>31);
2710
2711         src[y*stride-1] = p1;
2712         src[y*stride+0] = p2;
2713
2714         ad1= ABS(d1)>>1;
2715
2716         d2= clip((p0-p3)/4, -ad1, ad1);
2717
2718         src[y*stride-2] = p0 - d2;
2719         src[y*stride+1] = p3 + d2;
2720     }
2721 }
2722
/**
 * H.261 in-loop filter on one 8x8 block, in place.
 *
 * A separable (1,2,1)/4 filter is applied vertically and then horizontally.
 * On the block border the filter degenerates to the identity in the
 * direction that would leave the block: rows 0 and 7 are not filtered
 * vertically, columns 0 and 7 are not filtered horizontally.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between vertically adjacent pixels
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8]; /* vertically filtered values, scaled by 4 */
    int row, col;

    /* pass 1: vertical filter; the whole block is read before anything is written */
    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;
        const int is_border= (row == 0 || row == 7);

        for(col=0; col<8; col++){
            if(is_border)
                vert[row][col]= 4*line[col];
            else
                vert[row][col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* pass 2: horizontal filter and rounding back to pixel range */
    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        const int *v= vert[row];

        line[0]= (v[0] + 2)>>2;
        line[7]= (v[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (v[col-1] + 2*v[col] + v[col+1] + 8)>>4;
    }
}
2749
/**
 * H.264 normal-strength luma deblocking of one 16-line edge.
 *
 * The edge is processed as 4 groups of 4 lines. Group i uses the clipping
 * value tc0[i]; a negative tc0[i] skips the whole group.
 *
 * @param pix     first q0 pixel (the pixel just past the edge)
 * @param xstride step between neighbouring taps across the edge (p0 is pix[-xstride])
 * @param ystride step from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     four per-group clipping values
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* each side that is smooth enough also gets p1/q1 corrected
                 * and widens the clipping range for p0/q0 by one */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* multiply instead of "<< 2": q0 - p0 may be negative and
                 * left-shifting a negative int is undefined behaviour */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Luma deblocking where the taps of one line are a full row apart and
 * successive lines are adjacent bytes.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step  = stride;
    const int line_step = 1;

    h264_loop_filter_luma_c(pix, tap_step, line_step, alpha, beta, tc0);
}
/**
 * Luma deblocking where the taps of one line are adjacent bytes and
 * successive lines are a full row apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step  = 1;
    const int line_step = stride;

    h264_loop_filter_luma_c(pix, tap_step, line_step, alpha, beta, tc0);
}
2798
/**
 * H.264 normal-strength chroma deblocking of one 8-line edge.
 *
 * The edge is processed as 4 groups of 2 lines. Group i uses the clipping
 * value tc0[i]; a value <= 0 skips the whole group. Only p0 and q0 are
 * modified.
 *
 * @param pix     first q0 pixel (the pixel just past the edge)
 * @param xstride step between neighbouring taps across the edge (p0 is pix[-xstride])
 * @param ystride step from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     four per-group clipping values
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* multiply instead of "<< 2": q0 - p0 may be negative and
                 * left-shifting a negative int is undefined behaviour */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Chroma deblocking where the taps of one line are a full row apart and
 * successive lines are adjacent bytes.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step  = stride;
    const int line_step = 1;

    h264_loop_filter_chroma_c(pix, tap_step, line_step, alpha, beta, tc0);
}
/**
 * Chroma deblocking where the taps of one line are adjacent bytes and
 * successive lines are a full row apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step  = 1;
    const int line_step = stride;

    h264_loop_filter_chroma_c(pix, tap_step, line_step, alpha, beta, tc0);
}
2835
/**
 * H.264 strong (intra) chroma deblocking of one 8-line edge.
 *
 * Lines that pass the alpha/beta thresholds get p0 and q0 replaced by a
 * (2,1,1)/4 weighted average of p1,p0,q1 and q1,q0,p1 respectively; no
 * clipping value is involved.
 *
 * @param pix     first q0 pixel (the pixel just past the edge)
 * @param xstride step between neighbouring taps across the edge
 * @param ystride step from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave lines untouched whose step looks like a real image edge */
        if( ABS( p0 - q0 ) >= alpha ||
            ABS( p1 - p0 ) >= beta ||
            ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma deblocking where the taps of one line are a full row apart
 * and successive lines are adjacent bytes.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int tap_step  = stride;
    const int line_step = 1;

    h264_loop_filter_chroma_intra_c(pix, tap_step, line_step, alpha, beta);
}
/**
 * Intra chroma deblocking where the taps of one line are adjacent bytes and
 * successive lines are a full row apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int tap_step  = 1;
    const int line_step = stride;

    h264_loop_filter_chroma_intra_c(pix, tap_step, line_step, alpha, beta);
}
2863
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size stride of both blocks
 * @param h         number of rows
 * @return sum over h rows and 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2891
/**
 * Sum of absolute differences between a 16-wide block and a reference
 * formed by avg2() of each pix2 pixel with its right-hand neighbour.
 * Reads 17 pixels per row of pix2.
 *
 * @param v         unused
 * @param line_size stride of both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2919
/**
 * Sum of absolute differences between a 16-wide block and a reference
 * formed by avg2() of each pix2 pixel with the pixel one row below it.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size stride of both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2949
/**
 * Sum of absolute differences between a 16-wide block and a reference
 * formed by avg4() of each 2x2 neighbourhood of pix2 (pixel, right, below,
 * below-right). Reads 17 pixels per row and h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size stride of both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2979
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size stride of both blocks
 * @param h         number of rows
 * @return sum over h rows and 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2999
/**
 * Sum of absolute differences between an 8-pixel-wide block of pix1 and
 * the avg2() of each horizontal pixel pair of pix2.
 *
 * Column x compares pix1[x] against avg2(pix2[x], pix2[x+1]), so 9 columns
 * of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, sampled through avg2()
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3019
/**
 * Sum of absolute differences between an 8-pixel-wide block of pix1 and
 * the avg2() of each vertical pixel pair of pix2.
 *
 * Column x compares pix1[x] against avg2(pix2[x], pix2[x+line_size]), so
 * h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, sampled through avg2()
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3041
/**
 * Sum of absolute differences between an 8-pixel-wide block of pix1 and
 * the avg4() of each 2x2 neighbourhood of pix2.
 *
 * Column x compares pix1[x] against
 * avg4(pix2[x], pix2[x+1], pix2[x+line_size], pix2[x+1+line_size]),
 * so 9 columns and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, sampled through avg4()
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3063
3064 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3065     MpegEncContext *c = v;
3066     int score1=0;
3067     int score2=0;
3068     int x,y;
3069
3070     for(y=0; y<h; y++){
3071         for(x=0; x<16; x++){
3072             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3073         }
3074         if(y+1<h){
3075             for(x=0; x<15; x++){
3076                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3077                              - s1[x+1] + s1[x+1+stride])
3078                         -ABS(  s2[x  ] - s2[x  +stride]
3079                              - s2[x+1] + s2[x+1+stride]);
3080             }
3081         }
3082         s1+= stride;
3083         s2+= stride;
3084     }
3085
3086     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3087     else  return score1 + ABS(score2)*8;
3088 }
3089
3090 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3091     MpegEncContext *c = v;
3092     int score1=0;
3093     int score2=0;
3094     int x,y;
3095
3096     for(y=0; y<h; y++){
3097         for(x=0; x<8; x++){
3098             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3099         }
3100         if(y+1<h){
3101             for(x=0; x<7; x++){
3102                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3103                              - s1[x+1] + s1[x+1+stride])
3104                         -ABS(  s2[x  ] - s2[x  +stride]
3105                              - s2[x+1] + s2[x+1+stride]);
3106             }
3107         }
3108         s1+= stride;
3109         s2+= stride;
3110     }
3111
3112     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3113     else  return score1 + ABS(score2)*8;
3114 }
3115
3116 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3117     int i;
3118     unsigned int sum=0;
3119
3120     for(i=0; i<8*8; i++){
3121         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3122         int w= weight[i];
3123         b>>= RECON_SHIFT;
3124         assert(-512<b && b<512);
3125
3126         sum += (w*b)*(w*b)>>4;
3127     }
3128     return sum>>2;
3129 }
3130
3131 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3132     int i;
3133
3134     for(i=0; i<8*8; i++){
3135         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3136     }
3137 }
3138
3139 /**
3140  * permutes an 8x8 block.
3141  * @param block the block which will be permuted according to the given permutation vector
3142  * @param permutation the permutation vector
3143  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3144  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3145  *                  (inverse) permutated to scantable order!
3146  */
3147 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3148 {
3149     int i;
3150     DCTELEM temp[64];
3151
3152     if(last<=0) return;
3153     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3154
3155     for(i=0; i<=last; i++){
3156         const int j= scantable[i];
3157         temp[j]= block[j];
3158         block[j]=0;
3159     }
3160
3161     for(i=0; i<=last; i++){
3162         const int j= scantable[i];
3163         const int perm_j= permutation[j];
3164         block[perm_j]= temp[j];
3165     }
3166 }
3167
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3171
3172 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3173     int i;
3174
3175     memset(cmp, 0, sizeof(void*)*5);
3176
3177     for(i=0; i<5; i++){
3178         switch(type&0xFF){
3179         case FF_CMP_SAD:
3180             cmp[i]= c->sad[i];
3181             break;
3182         case FF_CMP_SATD:
3183             cmp[i]= c->hadamard8_diff[i];
3184             break;
3185         case FF_CMP_SSE:
3186             cmp[i]= c->sse[i];
3187             break;
3188         case FF_CMP_DCT:
3189             cmp[i]= c->dct_sad[i];
3190             break;
3191         case FF_CMP_DCT264:
3192             cmp[i]= c->dct264_sad[i];
3193             break;
3194         case FF_CMP_DCTMAX:
3195             cmp[i]= c->dct_max[i];
3196             break;
3197         case FF_CMP_PSNR:
3198             cmp[i]= c->quant_psnr[i];
3199             break;
3200         case FF_CMP_BIT:
3201             cmp[i]= c->bit[i];
3202             break;
3203         case FF_CMP_RD:
3204             cmp[i]= c->rd[i];
3205             break;
3206         case FF_CMP_VSAD:
3207             cmp[i]= c->vsad[i];
3208             break;
3209         case FF_CMP_VSSE:
3210             cmp[i]= c->vsse[i];
3211             break;
3212         case FF_CMP_ZERO:
3213             cmp[i]= zero_cmp;
3214             break;
3215         case FF_CMP_NSSE:
3216             cmp[i]= c->nsse[i];
3217             break;
3218         case FF_CMP_W53:
3219             cmp[i]= c->w53[i];
3220             break;
3221         case FF_CMP_W97:
3222             cmp[i]= c->w97[i];
3223             break;
3224         default:
3225             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3226         }
3227     }
3228 }
3229
3230 /**
3231  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3232  */
3233 static void clear_blocks_c(DCTELEM *blocks)
3234 {
3235     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3236 }
3237
/**
 * Adds src to dst byte by byte; each sum wraps modulo 256.
 *
 * @param dst bytes updated in place
 * @param src bytes to add
 * @param w   number of bytes to process; nothing happens when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    /* plain ascending loop; unrolling is left to the compiler */
    for (n = 0; n < w; n++) {
        dst[n] += src[n];
    }
}
3253
/**
 * Stores src1 - src2 into dst byte by byte; each difference wraps modulo 256.
 *
 * @param dst  destination bytes
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing happens when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    /* plain ascending loop; unrolling is left to the compiler */
    for (n = 0; n < w; n++) {
        dst[n] = src1[n] - src2[n];
    }
}
3269
/**
 * Writes the residual of src2 against a mid_pred() prediction into dst.
 *
 * For each position the prediction is mid_pred() of the previous src2
 * sample (left), the src1 sample at the same position, and
 * (left + src1 sample - previous src1 sample) & 0xFF.
 *
 * @param dst      residual output, w bytes
 * @param src1     reference samples, w bytes
 * @param src2     samples to predict, w bytes
 * @param w        number of samples
 * @param left     in: sample preceding src2[0]; out: last src2 sample read
 * @param left_top in: sample preceding src1[0]; out: last src1 sample read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur_left + top - cur_top_left)&0xFF;
        const int pred     = mid_pred(cur_left, top, gradient);

        cur_top_left = top;
        cur_left     = src2[x];
        dst[x]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
3287
/**
 * Two-input butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * Wrapped in do { } while(0) so that it acts as one statement and stays
 * correct inside an unbraced if/else or loop body.
 * i1 and i2 are each evaluated twice.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/**
 * In-place butterfly: x becomes x + y and y becomes x - y, both computed
 * from the values x and y held on entry.
 * Wrapped in do { } while(0) so that "BUTTERFLY1(x, y);" is exactly one
 * statement.
 */
#define BUTTERFLY1(x,y) \
do {\
    const int bf_a_= (x);\
    const int bf_b_= (y);\
    (x)= bf_a_+bf_b_;\
    (y)= bf_a_-bf_b_;\
} while(0)

/** Expression form of the last butterfly stage: |x + y| + |x - y|. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3302
/**
 * Sum of the absolute values of an 8x8 butterfly transform of src - dst.
 *
 * Each row of the difference goes through three butterfly stages, then
 * each column goes through two stages, and the final column stage is
 * folded into the summation through BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return accumulated absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int n;

    assert(h==8);

    /* horizontal pass over the pixel differences */
    for (n = 0; n < 8; n++) {
        //FIXME try pointer walks
        int *row = coef + 8*n;
        const uint8_t *sp = src + stride*n;
        const uint8_t *dp = dst + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the last stage only contributes to the sum */
    for (n = 0; n < 8; n++) {
        int *col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
3354
/**
 * Sum of the absolute values of an 8x8 butterfly transform of src itself,
 * with the |coef[0] + coef[32]| term subtracted at the end (marked
 * "-mean" in the code).
 *
 * Each row goes through three butterfly stages, then each column goes
 * through two stages, and the final column stage is folded into the
 * summation through BUTTERFLYA.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return accumulated absolute transform coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int n;

    assert(h==8);

    /* horizontal pass over the pixels */
    for (n = 0; n < 8; n++) {
        //FIXME try pointer walks
        int *row = coef + 8*n;
        const uint8_t *sp = src + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the last stage only contributes to the sum */
    for (n = 0; n < 8; n++) {
        int *col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3402
3403 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3404     MpegEncContext * const s= (MpegEncContext *)c;
3405     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3406     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3407     int sum=0, i;
3408
3409     assert(h==8);
3410
3411     s->dsp.diff_pixels(temp, src1, src2, stride);
3412     s->dsp.fdct(temp);
3413
3414     for(i=0; i<64; i++)
3415         sum+= ABS(temp[i]);
3416
3417     return sum;
3418 }
3419
#ifdef CONFIG_GPL
/**
 * One 8-point 1-D transform pass built from additions, subtractions and
 * shifts only. Inputs are read through SRC(n) and outputs are emitted
 * through DST(n, value); both macros must be defined before expansion.
 * The expansion is a single braced block, so it may be used as a loop body.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of the absolute values of the DCT8_1D transform, applied to rows and
 * then columns, of the difference block from diff_pixels.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      not used by this function
 * @return accumulated absolute coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* pass the first row so the argument is a pointer to the element type,
       not a pointer to an array of 8 elements */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: outputs are not stored, only their magnitudes are summed */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3472
3473 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474     MpegEncContext * const s= (MpegEncContext *)c;
3475     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3476     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3477     int sum=0, i;
3478
3479     assert(h==8);
3480
3481     s->dsp.diff_pixels(temp, src1, src2, stride);
3482     s->dsp.fdct(temp);
3483
3484     for(i=0; i<64; i++)
3485         sum= FFMAX(sum, ABS(temp[i]));
3486
3487     return sum;
3488 }
3489
3490 void simple_idct(DCTELEM *block); //FIXME
3491
3492 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493     MpegEncContext * const s= (MpegEncContext *)c;
3494     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3495     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3496     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3497     int sum=0, i;
3498
3499     assert(h==8);
3500     s->mb_intra=0;
3501
3502     s->dsp.diff_pixels(temp, src1, src2, stride);
3503
3504     memcpy(bak, temp, 64*sizeof(DCTELEM));
3505
3506     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3507     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3508     simple_idct(temp); //FIXME
3509
3510     for(i=0; i<64; i++)
3511         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3512
3513     return sum;
3514 }
3515
3516 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3517     MpegEncContext * const s= (MpegEncContext *)c;
3518     const uint8_t *scantable= s->intra_scantable.permutated;
3519     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3520     uint64_t __align8 aligned_bak[stride];
3521     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3522     uint8_t * const bak= (uint8_t*)aligned_bak;
3523     int i, last, run, bits, level, distoration, start_i;
3524     const int esc_length= s->ac_esc_length;
3525     uint8_t * length;
3526     uint8_t * last_length;
3527
3528     assert(h==8);
3529
3530     for(i=0; i<8; i++){
3531         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3532         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3533     }
3534
3535     s->dsp.diff_pixels(temp, src1, src2, stride);
3536
3537     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3538
3539     bits=0;
3540
3541     if (s->mb_intra) {
3542         start_i = 1;
3543         length     = s->intra_ac_vlc_length;
3544         last_length= s->intra_ac_vlc_last_length;
3545         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3546     } else {
3547         start_i = 0;
3548         length     = s->inter_ac_vlc_length;
3549         last_length= s->inter_ac_vlc_last_length;
3550     }
3551
3552     if(last>=start_i){
3553         run=0;
3554         for(i=start_i; i<last; i++){
3555             int j= scantable[i];
3556             level= temp[j];
3557
3558             if(level){
3559                 level+=64;
3560                 if((level&(~127)) == 0){
3561                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3562                 }else
3563                     bits+= esc_length;
3564                 run=0;
3565             }else
3566                 run++;
3567         }
3568         i= scantable[last];
3569
3570         level= temp[i] + 64;
3571
3572         assert(level - 64);
3573
3574         if((level&(~127)) == 0){
3575             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3576         }else
3577             bits+= esc_length;
3578
3579     }
3580
3581     if(last>=0){
3582         if(s->mb_intra)
3583             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3584         else
3585             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3586     }
3587
3588     s->dsp.idct_add(bak, stride, temp);
3589
3590     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3591
3592     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3593 }
3594
3595 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3596     MpegEncContext * const s= (MpegEncContext *)c;
3597     const uint8_t *scantable= s->intra_scantable.permutated;
3598     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3599     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3600     int i, last, run, bits, level, start_i;
3601     const int esc_length= s->ac_esc_length;
3602     uint8_t * length;
3603     uint8_t * last_length;
3604
3605     assert(h==8);
3606
3607     s->dsp.diff_pixels(temp, src1, src2, stride);
3608
3609     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3610
3611     bits=0;
3612
3613     if (s->mb_intra) {
3614         start_i = 1;
3615         length     = s->intra_ac_vlc_length;
3616         last_length= s->intra_ac_vlc_last_length;
3617         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3618     } else {
3619         start_i = 0;
3620         length     = s->inter_ac_vlc_length;
3621         last_length= s->inter_ac_vlc_last_length;
3622     }
3623
3624     if(last>=start_i){
3625         run=0;
3626         for(i=start_i; i<last; i++){
3627             int j= scantable[i];
3628             level= temp[j];
3629
3630             if(level){
3631                 level+=64;
3632                 if((level&(~127)) == 0){
3633                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3634                 }else
3635                     bits+= esc_length;
3636                 run=0;
3637             }else
3638                 run++;
3639         }
3640         i= scantable[last];
3641
3642         level= temp[i] + 64;
3643
3644         assert(level - 64);
3645
3646         if((level&(~127)) == 0){
3647             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3648         }else
3649             bits+= esc_length;
3650     }
3651
3652     return bits;
3653 }
3654
/**
 * Sum of absolute differences between each row of a 16-pixel-wide block
 * and the row below it; h-1 row pairs are accumulated.
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows in the block
 * @return accumulated absolute vertical difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = s + stride;

        for (col = 0; col < 16; col++) {
            total += abs(s[col] - below[col]);
        }
        s += stride;
    }

    return total;
}
3669
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block; h-1 row pairs are accumulated.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows in the block
 * @return accumulated absolute vertical difference of the residual
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += abs(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3684
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between each row of a 16-pixel-wide block
 * and the row below it; h-1 row pairs are accumulated.
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows in the block
 * @return accumulated squared vertical difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = s + stride;

        for (col = 0; col < 16; col++) {
            total += SQ(s[col] - below[col]);
        }
        s += stride;
    }

    return total;
}
3700
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block; h-1 row pairs are accumulated.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows in the block
 * @return accumulated squared vertical difference of the residual
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += delta * delta;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3715
/* Instantiate WARPER8_16_SQ once per 8x8 comparator defined above; the
 * second argument names the generated function.
 * NOTE(review): the macro is defined outside this part of the file;
 * presumably it builds a 16-pixel-wide wrapper out of calls to the 8x8
 * function -- confirm against its definition. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
/* dct264_sad8x8_c only exists when CONFIG_GPL is defined */
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3726
3727 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3728  converted */
3729 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3730 {
3731     j_rev_dct (block);
3732     put_pixels_clamped_c(block, dest, line_size);
3733 }
3734 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3735 {
3736     j_rev_dct (block);
3737     add_pixels_clamped_c(block, dest, line_size);
3738 }
3739
3740 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3741 {
3742     j_rev_dct4 (block);
3743     put_pixels_clamped4_c(block, dest, line_size);
3744 }
3745 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3746 {
3747     j_rev_dct4 (block);
3748     add_pixels_clamped4_c(block, dest, line_size);
3749 }
3750
3751 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3752 {
3753     j_rev_dct2 (block);
3754     put_pixels_clamped2_c(block, dest, line_size);
3755 }
3756 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3757 {
3758     j_rev_dct2 (block);
3759     add_pixels_clamped2_c(block, dest, line_size);
3760 }
3761
3762 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3763 {
3764     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3765
3766     dest[0] = cm[(block[0] + 4)>>3];
3767 }
3768 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3769 {
3770     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3771
3772     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3773 }
3774
3775 /* init static data */
3776 void dsputil_static_init(void)
3777 {
3778     int i;
3779
3780     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3781     for(i=0;i<MAX_NEG_CROP;i++) {
3782         cropTbl[i] = 0;
3783         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3784     }
3785
3786     for(i=0;i<512;i++) {
3787         squareTbl[i] = (i - 256) * (i - 256);
3788     }
3789
3790     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3791 }
3792
3793
3794 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3795 {
3796     int i;
3797
3798 #ifdef CONFIG_ENCODERS
3799     if(avctx->dct_algo==FF_DCT_FASTINT) {
3800         c->fdct = fdct_ifast;
3801         c->fdct248 = fdct_ifast248;
3802     }
3803     else if(avctx->dct_algo==FF_DCT_FAAN) {
3804         c->fdct = ff_faandct;
3805         c->fdct248 = ff_faandct248;
3806     }
3807     else {
3808         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3809         c->fdct248 = ff_fdct248_islow;
3810     }
3811 #endif //CONFIG_ENCODERS
3812
3813     if(avctx->lowres==1){
3814         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3815             c->idct_put= ff_jref_idct4_put;
3816             c->idct_add= ff_jref_idct4_add;
3817         }else{
3818             c->idct_put= ff_h264_lowres_idct_put_c;
3819             c->idct_add= ff_h264_lowres_idct_add_c;
3820         }
3821         c->idct    = j_rev_dct4;
3822         c->idct_permutation_type= FF_NO_IDCT_PERM;
3823     }else if(avctx->lowres==2){
3824         c->idct_put= ff_jref_idct2_put;
3825         c->idct_add= ff_jref_idct2_add;
3826         c->idct    = j_rev_dct2;
3827         c->idct_permutation_type= FF_NO_IDCT_PERM;
3828     }else if(avctx->lowres==3){
3829         c->idct_put= ff_jref_idct1_put;
3830         c->idct_add= ff_jref_idct1_add;
3831         c->idct    = j_rev_dct1;
3832         c->idct_permutation_type= FF_NO_IDCT_PERM;
3833     }else{
3834         if(avctx->idct_algo==FF_IDCT_INT){
3835             c->idct_put= ff_jref_idct_put;
3836             c->idct_add= ff_jref_idct_add;
3837             c->idct    = j_rev_dct;
3838             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3839         }else if(avctx->idct_algo==FF_IDCT_VP3){
3840             c->idct_put= ff_vp3_idct_put_c;
3841             c->idct_add= ff_vp3_idct_add_c;
3842             c->idct    = ff_vp3_idct_c;
3843             c->idct_permutation_type= FF_NO_IDCT_PERM;
3844         }else{ //accurate/default
3845             c->idct_put= simple_idct_put;
3846             c->idct_add= simple_idct_add;
3847             c->idct    = simple_idct;
3848             c->idct_permutation_type= FF_NO_IDCT_PERM;
3849         }
3850     }
3851
3852     c->h264_idct_add= ff_h264_idct_add_c;
3853     c->h264_idct8_add= ff_h264_idct8_add_c;
3854
3855     c->get_pixels = get_pixels_c;
3856     c->diff_pixels = diff_pixels_c;
3857     c->put_pixels_clamped = put_pixels_clamped_c;
3858     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3859     c->add_pixels_clamped = add_pixels_clamped_c;
3860     c->add_pixels8 = add_pixels8_c;
3861     c->add_pixels4 = add_pixels4_c;
3862     c->gmc1 = gmc1_c;
3863     c->gmc = gmc_c;
3864     c->clear_blocks = clear_blocks_c;
3865     c->pix_sum = pix_sum_c;
3866     c->pix_norm1 = pix_norm1_c;
3867
3868     /* TODO [0] 16  [1] 8 */
3869     c->pix_abs[0][0] = pix_abs16_c;
3870     c->pix_abs[0][1] = pix_abs16_x2_c;
3871     c->pix_abs[0][2] = pix_abs16_y2_c;
3872     c->pix_abs[0][3] = pix_abs16_xy2_c;
3873     c->pix_abs[1][0] = pix_abs8_c;
3874     c->pix_abs[1][1] = pix_abs8_x2_c;
3875     c->pix_abs[1][2] = pix_abs8_y2_c;
3876     c->pix_abs[1][3] = pix_abs8_xy2_c;
3877
3878 #define dspfunc(PFX, IDX, NUM) \
3879     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3880     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3881     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3882     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3883
3884     dspfunc(put, 0, 16);
3885     dspfunc(put_no_rnd, 0, 16);
3886     dspfunc(put, 1, 8);
3887     dspfunc(put_no_rnd, 1, 8);
3888     dspfunc(put, 2, 4);
3889     dspfunc(put, 3, 2);
3890
3891     dspfunc(avg, 0, 16);
3892     dspfunc(avg_no_rnd, 0, 16);
3893     dspfunc(avg, 1, 8);
3894     dspfunc(avg_no_rnd, 1, 8);
3895     dspfunc(avg, 2, 4);
3896     dspfunc(avg, 3, 2);
3897 #undef dspfunc
3898
3899     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3900     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3901
3902     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3903     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3904     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3905     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3906     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3907     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3908     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3909     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3910     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3911
3912     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3913     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3914     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3915     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3916     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3917     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3918     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3919     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3920     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3921
3922 #define dspfunc(PFX, IDX, NUM) \
3923     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3924     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3925     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3926     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3927     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3928     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3929     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3930     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3931     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3932     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3933     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3934     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3935     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3936     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3937     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3938     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3939
3940     dspfunc(put_qpel, 0, 16);
3941     dspfunc(put_no_rnd_qpel, 0, 16);
3942
3943     dspfunc(avg_qpel, 0, 16);
3944     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3945
3946     dspfunc(put_qpel, 1, 8);
3947     dspfunc(put_no_rnd_qpel, 1, 8);
3948
3949     dspfunc(avg_qpel, 1, 8);
3950     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3951
3952     dspfunc(put_h264_qpel, 0, 16);
3953     dspfunc(put_h264_qpel, 1, 8);
3954     dspfunc(put_h264_qpel, 2, 4);
3955     dspfunc(put_h264_qpel, 3, 2);
3956     dspfunc(avg_h264_qpel, 0, 16);
3957     dspfunc(avg_h264_qpel, 1, 8);
3958     dspfunc(avg_h264_qpel, 2, 4);
3959
3960 #undef dspfunc
3961     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3962     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3963     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3964     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3965     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3966     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3967
3968     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3969     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3970     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3971     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3972     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3973     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3974     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3975     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3976     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3977     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3978     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3979     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3980     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3981     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3982     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3983     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3984     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3985     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3986     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3987     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3988
3989     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3990     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3991     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3992     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3993     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3994     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3995     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3996     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3997
3998 #define SET_CMP_FUNC(name) \
3999     c->name[0]= name ## 16_c;\
4000     c->name[1]= name ## 8x8_c;
4001
4002     SET_CMP_FUNC(hadamard8_diff)
4003     c->hadamard8_diff[4]= hadamard8_intra16_c;
4004     SET_CMP_FUNC(dct_sad)
4005     SET_CMP_FUNC(dct_max)
4006 #ifdef CONFIG_GPL
4007     SET_CMP_FUNC(dct264_sad)
4008 #endif
4009     c->sad[0]= pix_abs16_c;
4010     c->sad[1]= pix_abs8_c;
4011     c->sse[0]= sse16_c;
4012     c->sse[1]= sse8_c;
4013     c->sse[2]= sse4_c;
4014     SET_CMP_FUNC(quant_psnr)
4015     SET_CMP_FUNC(rd)
4016     SET_CMP_FUNC(bit)
4017     c->vsad[0]= vsad16_c;
4018     c->vsad[4]= vsad_intra16_c;
4019     c->vsse[0]= vsse16_c;
4020     c->vsse[4]= vsse_intra16_c;
4021     c->nsse[0]= nsse16_c;
4022     c->nsse[1]= nsse8_c;
4023     c->w53[0]= w53_16_c;
4024     c->w53[1]= w53_8_c;
4025     c->w97[0]= w97_16_c;
4026     c->w97[1]= w97_8_c;
4027
4028     c->add_bytes= add_bytes_c;
4029     c->diff_bytes= diff_bytes_c;
4030     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4031     c->bswap_buf= bswap_buf;
4032
4033     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4034     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4035     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4036     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4037     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4038     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4039
4040     c->h263_h_loop_filter= h263_h_loop_filter_c;
4041     c->h263_v_loop_filter= h263_v_loop_filter_c;
4042
4043     c->h261_loop_filter= h261_loop_filter_c;
4044
4045     c->try_8x8basis= try_8x8basis_c;
4046     c->add_8x8basis= add_8x8basis_c;
4047
4048 #ifdef HAVE_MMX
4049     dsputil_init_mmx(c, avctx);
4050 #endif
4051 #ifdef ARCH_ARMV4L
4052     dsputil_init_armv4l(c, avctx);
4053 #endif
4054 #ifdef HAVE_MLIB
4055     dsputil_init_mlib(c, avctx);
4056 #endif
4057 #ifdef ARCH_SPARC
4058    dsputil_init_vis(c,avctx);
4059 #endif
4060 #ifdef ARCH_ALPHA
4061     dsputil_init_alpha(c, avctx);
4062 #endif
4063 #ifdef ARCH_POWERPC
4064     dsputil_init_ppc(c, avctx);
4065 #endif
4066 #ifdef HAVE_MMI
4067     dsputil_init_mmi(c, avctx);
4068 #endif
4069 #ifdef ARCH_SH4
4070     dsputil_init_sh4(c,avctx);
4071 #endif
4072
4073     switch(c->idct_permutation_type){
4074     case FF_NO_IDCT_PERM:
4075         for(i=0; i<64; i++)
4076             c->idct_permutation[i]= i;
4077         break;
4078     case FF_LIBMPEG2_IDCT_PERM:
4079         for(i=0; i<64; i++)
4080             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4081         break;
4082     case FF_SIMPLE_IDCT_PERM:
4083         for(i=0; i<64; i++)
4084             c->idct_permutation[i]= simple_mmx_permutation[i];
4085         break;
4086     case FF_TRANSPOSE_IDCT_PERM:
4087         for(i=0; i<64; i++)
4088             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4089         break;
4090     case FF_PARTTRANS_IDCT_PERM:
4091         for(i=0; i<64; i++)
4092             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4093         break;
4094     default:
4095         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4096     }
4097 }
4098