git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
  34 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
  35 uint32_t squareTbl[512];
  36
  37 const uint8_t ff_zigzag_direct[64] = {
  38     0,   1,  8, 16,  9,  2,  3, 10,
  39     17, 24, 32, 25, 18, 11,  4,  5,
  40     12, 19, 26, 33, 40, 48, 41, 34,
  41     27, 20, 13,  6,  7, 14, 21, 28,
  42     35, 42, 49, 56, 57, 50, 43, 36,
  43     29, 22, 15, 23, 30, 37, 44, 51,
  44     58, 59, 52, 45, 38, 31, 39, 46,
  45     53, 60, 61, 54, 47, 55, 62, 63
  46 };
  47
  48 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  49    specification, we interleave the fields */
  50 const uint8_t ff_zigzag248_direct[64] = {
  51      0,  8,  1,  9, 16, 24,  2, 10,
  52     17, 25, 32, 40, 48, 56, 33, 41,
  53     18, 26,  3, 11,  4, 12, 19, 27,
  54     34, 42, 49, 57, 50, 58, 35, 43,
  55     20, 28,  5, 13,  6, 14, 21, 29,
  56     36, 44, 51, 59, 52, 60, 37, 45,
  57     22, 30,  7, 15, 23, 31, 38, 46,
  58     53, 61, 54, 62, 39, 47, 55, 63,
  59 };
  60
  61 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  62 uint16_t __align8 inv_zigzag_direct16[64];
  63
  64 const uint8_t ff_alternate_horizontal_scan[64] = {
  65     0,  1,   2,  3,  8,  9, 16, 17,
  66     10, 11,  4,  5,  6,  7, 15, 14,
  67     13, 12, 19, 18, 24, 25, 32, 33,
  68     26, 27, 20, 21, 22, 23, 28, 29,
  69     30, 31, 34, 35, 40, 41, 48, 49,
  70     42, 43, 36, 37, 38, 39, 44, 45,
  71     46, 47, 50, 51, 56, 57, 58, 59,
  72     52, 53, 54, 55, 60, 61, 62, 63,
  73 };
  74
  75 const uint8_t ff_alternate_vertical_scan[64] = {
  76     0,  8,  16, 24,  1,  9,  2, 10,
  77     17, 25, 32, 40, 48, 56, 57, 49,
  78     41, 33, 26, 18,  3, 11,  4, 12,
  79     19, 27, 34, 42, 50, 58, 35, 43,
  80     51, 59, 20, 28,  5, 13,  6, 14,
  81     21, 29, 36, 44, 52, 60, 37, 45,
  82     53, 61, 22, 30,  7, 15, 23, 31,
  83     38, 46, 54, 62, 39, 47, 55, 63,
  84 };
  85
  86 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  87 const uint32_t inverse[256]={
  88          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  89  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  90  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  91  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  92  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  93  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  94   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  95   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  96   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  97   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  98   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  99   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 100   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 101   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 102   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 103   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 104   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 105   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 106   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 107   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 108   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 109   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 110   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 111   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 112   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 113   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 114   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 115   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 116   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 117   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 118   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 119   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 120 };
 121
 122 /* Input permutation for the simple_idct_mmx */
 123 static const uint8_t simple_mmx_permutation[64]={
 124         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 125         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 126         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 127         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 128         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 129         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 130         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 131         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 132 };
 133
 134 static int pix_sum_c(uint8_t * pix, int line_size)
 135 {
 136     int s, i, j;
 137
 138     s = 0;
 139     for (i = 0; i < 16; i++) {
 140         for (j = 0; j < 16; j += 8) {
 141             s += pix[0];
 142             s += pix[1];
 143             s += pix[2];
 144             s += pix[3];
 145             s += pix[4];
 146             s += pix[5];
 147             s += pix[6];
 148             s += pix[7];
 149             pix += 8;
 150         }
 151         pix += line_size - 16;
 152     }
 153     return s;
 154 }
 155
 156 static int pix_norm1_c(uint8_t * pix, int line_size)
 157 {
 158     int s, i, j;
 159     uint32_t *sq = squareTbl + 256;
 160
 161     s = 0;
 162     for (i = 0; i < 16; i++) {
 163         for (j = 0; j < 16; j += 8) {
 164 #if 0
 165             s += sq[pix[0]];
 166             s += sq[pix[1]];
 167             s += sq[pix[2]];
 168             s += sq[pix[3]];
 169             s += sq[pix[4]];
 170             s += sq[pix[5]];
 171             s += sq[pix[6]];
 172             s += sq[pix[7]];
 173 #else
 174 #if LONG_MAX > 2147483647
 175             register uint64_t x=*(uint64_t*)pix;
 176             s += sq[x&0xff];
 177             s += sq[(x>>8)&0xff];
 178             s += sq[(x>>16)&0xff];
 179             s += sq[(x>>24)&0xff];
 180             s += sq[(x>>32)&0xff];
 181             s += sq[(x>>40)&0xff];
 182             s += sq[(x>>48)&0xff];
 183             s += sq[(x>>56)&0xff];
 184 #else
 185             register uint32_t x=*(uint32_t*)pix;
 186             s += sq[x&0xff];
 187             s += sq[(x>>8)&0xff];
 188             s += sq[(x>>16)&0xff];
 189             s += sq[(x>>24)&0xff];
 190             x=*(uint32_t*)(pix+4);
 191             s += sq[x&0xff];
 192             s += sq[(x>>8)&0xff];
 193             s += sq[(x>>16)&0xff];
 194             s += sq[(x>>24)&0xff];
 195 #endif
 196 #endif
 197             pix += 8;
 198         }
 199         pix += line_size - 16;
 200     }
 201     return s;
 202 }
 203
 204 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 205     int i;
 206
 207     for(i=0; i+8<=w; i+=8){
 208         dst[i+0]= bswap_32(src[i+0]);
 209         dst[i+1]= bswap_32(src[i+1]);
 210         dst[i+2]= bswap_32(src[i+2]);
 211         dst[i+3]= bswap_32(src[i+3]);
 212         dst[i+4]= bswap_32(src[i+4]);
 213         dst[i+5]= bswap_32(src[i+5]);
 214         dst[i+6]= bswap_32(src[i+6]);
 215         dst[i+7]= bswap_32(src[i+7]);
 216     }
 217     for(;i<w; i++){
 218         dst[i+0]= bswap_32(src[i+0]);
 219     }
 220 }
 221
 222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 223 {
 224     int s, i;
 225     uint32_t *sq = squareTbl + 256;
 226
 227     s = 0;
 228     for (i = 0; i < h; i++) {
 229         s += sq[pix1[0] - pix2[0]];
 230         s += sq[pix1[1] - pix2[1]];
 231         s += sq[pix1[2] - pix2[2]];
 232         s += sq[pix1[3] - pix2[3]];
 233         s += sq[pix1[4] - pix2[4]];
 234         s += sq[pix1[5] - pix2[5]];
 235         s += sq[pix1[6] - pix2[6]];
 236         s += sq[pix1[7] - pix2[7]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[ 0] - pix2[ 0]];
 251         s += sq[pix1[ 1] - pix2[ 1]];
 252         s += sq[pix1[ 2] - pix2[ 2]];
 253         s += sq[pix1[ 3] - pix2[ 3]];
 254         s += sq[pix1[ 4] - pix2[ 4]];
 255         s += sq[pix1[ 5] - pix2[ 5]];
 256         s += sq[pix1[ 6] - pix2[ 6]];
 257         s += sq[pix1[ 7] - pix2[ 7]];
 258         s += sq[pix1[ 8] - pix2[ 8]];
 259         s += sq[pix1[ 9] - pix2[ 9]];
 260         s += sq[pix1[10] - pix2[10]];
 261         s += sq[pix1[11] - pix2[11]];
 262         s += sq[pix1[12] - pix2[12]];
 263         s += sq[pix1[13] - pix2[13]];
 264         s += sq[pix1[14] - pix2[14]];
 265         s += sq[pix1[15] - pix2[15]];
 266
 267         pix1 += line_size;
 268         pix2 += line_size;
 269     }
 270     return s;
 271 }
 272
 273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 274 {
 275     int i;
 276
 277     /* read the pixels */
 278     for(i=0;i<8;i++) {
 279         block[0] = pixels[0];
 280         block[1] = pixels[1];
 281         block[2] = pixels[2];
 282         block[3] = pixels[3];
 283         block[4] = pixels[4];
 284         block[5] = pixels[5];
 285         block[6] = pixels[6];
 286         block[7] = pixels[7];
 287         pixels += line_size;
 288         block += 8;
 289     }
 290 }
 291
 292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 293                           const uint8_t *s2, int stride){
 294     int i;
 295
 296     /* read the pixels */
 297     for(i=0;i<8;i++) {
 298         block[0] = s1[0] - s2[0];
 299         block[1] = s1[1] - s2[1];
 300         block[2] = s1[2] - s2[2];
 301         block[3] = s1[3] - s2[3];
 302         block[4] = s1[4] - s2[4];
 303         block[5] = s1[5] - s2[5];
 304         block[6] = s1[6] - s2[6];
 305         block[7] = s1[7] - s2[7];
 306         s1 += stride;
 307         s2 += stride;
 308         block += 8;
 309     }
 310 }
 311
 312
 313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 314                                  int line_size)
 315 {
 316     int i;
 317     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 318
 319     /* read the pixels */
 320     for(i=0;i<8;i++) {
 321         pixels[0] = cm[block[0]];
 322         pixels[1] = cm[block[1]];
 323         pixels[2] = cm[block[2]];
 324         pixels[3] = cm[block[3]];
 325         pixels[4] = cm[block[4]];
 326         pixels[5] = cm[block[5]];
 327         pixels[6] = cm[block[6]];
 328         pixels[7] = cm[block[7]];
 329
 330         pixels += line_size;
 331         block += 8;
 332     }
 333 }
 334
 335 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 336                           int line_size)
 337 {
 338     int i;
 339     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 340
 341     /* read the pixels */
 342     for(i=0;i<8;i++) {
 343         pixels[0] = cm[pixels[0] + block[0]];
 344         pixels[1] = cm[pixels[1] + block[1]];
 345         pixels[2] = cm[pixels[2] + block[2]];
 346         pixels[3] = cm[pixels[3] + block[3]];
 347         pixels[4] = cm[pixels[4] + block[4]];
 348         pixels[5] = cm[pixels[5] + block[5]];
 349         pixels[6] = cm[pixels[6] + block[6]];
 350         pixels[7] = cm[pixels[7] + block[7]];
 351         pixels += line_size;
 352         block += 8;
 353     }
 354 }
 355 #if 0
 356
 357 #define PIXOP2(OPNAME, OP) \
 358 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 359 {\
 360     int i;\
 361     for(i=0; i<h; i++){\
 362         OP(*((uint64_t*)block), LD64(pixels));\
 363         pixels+=line_size;\
 364         block +=line_size;\
 365     }\
 366 }\
 367 \
 368 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 369 {\
 370     int i;\
 371     for(i=0; i<h; i++){\
 372         const uint64_t a= LD64(pixels  );\
 373         const uint64_t b= LD64(pixels+1);\
 374         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 375         pixels+=line_size;\
 376         block +=line_size;\
 377     }\
 378 }\
 379 \
 380 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 381 {\
 382     int i;\
 383     for(i=0; i<h; i++){\
 384         const uint64_t a= LD64(pixels  );\
 385         const uint64_t b= LD64(pixels+1);\
 386         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 387         pixels+=line_size;\
 388         block +=line_size;\
 389     }\
 390 }\
 391 \
 392 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 393 {\
 394     int i;\
 395     for(i=0; i<h; i++){\
 396         const uint64_t a= LD64(pixels          );\
 397         const uint64_t b= LD64(pixels+line_size);\
 398         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 399         pixels+=line_size;\
 400         block +=line_size;\
 401     }\
 402 }\
 403 \
 404 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 405 {\
 406     int i;\
 407     for(i=0; i<h; i++){\
 408         const uint64_t a= LD64(pixels          );\
 409         const uint64_t b= LD64(pixels+line_size);\
 410         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 411         pixels+=line_size;\
 412         block +=line_size;\
 413     }\
 414 }\
 415 \
 416 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 417 {\
 418         int i;\
 419         const uint64_t a= LD64(pixels  );\
 420         const uint64_t b= LD64(pixels+1);\
 421         uint64_t l0=  (a&0x0303030303030303ULL)\
 422                     + (b&0x0303030303030303ULL)\
 423                     + 0x0202020202020202ULL;\
 424         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 425                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 426         uint64_t l1,h1;\
 427 \
 428         pixels+=line_size;\
 429         for(i=0; i<h; i+=2){\
 430             uint64_t a= LD64(pixels  );\
 431             uint64_t b= LD64(pixels+1);\
 432             l1=  (a&0x0303030303030303ULL)\
 433                + (b&0x0303030303030303ULL);\
 434             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 435               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 436             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 437             pixels+=line_size;\
 438             block +=line_size;\
 439             a= LD64(pixels  );\
 440             b= LD64(pixels+1);\
 441             l0=  (a&0x0303030303030303ULL)\
 442                + (b&0x0303030303030303ULL)\
 443                + 0x0202020202020202ULL;\
 444             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 445               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 446             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 447             pixels+=line_size;\
 448             block +=line_size;\
 449         }\
 450 }\
 451 \
 452 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 453 {\
 454         int i;\
 455         const uint64_t a= LD64(pixels  );\
 456         const uint64_t b= LD64(pixels+1);\
 457         uint64_t l0=  (a&0x0303030303030303ULL)\
 458                     + (b&0x0303030303030303ULL)\
 459                     + 0x0101010101010101ULL;\
 460         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 461                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 462         uint64_t l1,h1;\
 463 \
 464         pixels+=line_size;\
 465         for(i=0; i<h; i+=2){\
 466             uint64_t a= LD64(pixels  );\
 467             uint64_t b= LD64(pixels+1);\
 468             l1=  (a&0x0303030303030303ULL)\
 469                + (b&0x0303030303030303ULL);\
 470             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 471               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 472             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 473             pixels+=line_size;\
 474             block +=line_size;\
 475             a= LD64(pixels  );\
 476             b= LD64(pixels+1);\
 477             l0=  (a&0x0303030303030303ULL)\
 478                + (b&0x0303030303030303ULL)\
 479                + 0x0101010101010101ULL;\
 480             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 481               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 482             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 483             pixels+=line_size;\
 484             block +=line_size;\
 485         }\
 486 }\
 487 \
 488 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 489 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 490 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 491 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 492 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 493 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 494 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 495
 496 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 497 #else // 64 bit variant
 498
 499 #define PIXOP2(OPNAME, OP) \
 500 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 501     int i;\
 502     for(i=0; i<h; i++){\
 503         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 504         pixels+=line_size;\
 505         block +=line_size;\
 506     }\
 507 }\
 508 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 509     int i;\
 510     for(i=0; i<h; i++){\
 511         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 512         pixels+=line_size;\
 513         block +=line_size;\
 514     }\
 515 }\
 516 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 517     int i;\
 518     for(i=0; i<h; i++){\
 519         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 520         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 521         pixels+=line_size;\
 522         block +=line_size;\
 523     }\
 524 }\
 525 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 526     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 527 }\
 528 \
 529 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 530                                                 int src_stride1, int src_stride2, int h){\
 531     int i;\
 532     for(i=0; i<h; i++){\
 533         uint32_t a,b;\
 534         a= LD32(&src1[i*src_stride1  ]);\
 535         b= LD32(&src2[i*src_stride2  ]);\
 536         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 537         a= LD32(&src1[i*src_stride1+4]);\
 538         b= LD32(&src2[i*src_stride2+4]);\
 539         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 540     }\
 541 }\
 542 \
 543 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 544                                                 int src_stride1, int src_stride2, int h){\
 545     int i;\
 546     for(i=0; i<h; i++){\
 547         uint32_t a,b;\
 548         a= LD32(&src1[i*src_stride1  ]);\
 549         b= LD32(&src2[i*src_stride2  ]);\
 550         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 551         a= LD32(&src1[i*src_stride1+4]);\
 552         b= LD32(&src2[i*src_stride2+4]);\
 553         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 554     }\
 555 }\
 556 \
 557 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 558                                                 int src_stride1, int src_stride2, int h){\
 559     int i;\
 560     for(i=0; i<h; i++){\
 561         uint32_t a,b;\
 562         a= LD32(&src1[i*src_stride1  ]);\
 563         b= LD32(&src2[i*src_stride2  ]);\
 564         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 565     }\
 566 }\
 567 \
 568 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 569                                                 int src_stride1, int src_stride2, int h){\
 570     int i;\
 571     for(i=0; i<h; i++){\
 572         uint32_t a,b;\
 573         a= LD16(&src1[i*src_stride1  ]);\
 574         b= LD16(&src2[i*src_stride2  ]);\
 575         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 576     }\
 577 }\
 578 \
 579 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 580                                                 int src_stride1, int src_stride2, int h){\
 581     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 582     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 583 }\
 584 \
 585 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 586                                                 int src_stride1, int src_stride2, int h){\
 587     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 588     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 589 }\
 590 \
 591 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 592     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 593 }\
 594 \
 595 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 596     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 597 }\
 598 \
 599 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 600     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 601 }\
 602 \
 603 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 604     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 605 }\
 606 \
 607 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 608                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 609     int i;\
 610     for(i=0; i<h; i++){\
 611         uint32_t a, b, c, d, l0, l1, h0, h1;\
 612         a= LD32(&src1[i*src_stride1]);\
 613         b= LD32(&src2[i*src_stride2]);\
 614         c= LD32(&src3[i*src_stride3]);\
 615         d= LD32(&src4[i*src_stride4]);\
 616         l0=  (a&0x03030303UL)\
 617            + (b&0x03030303UL)\
 618            + 0x02020202UL;\
 619         h0= ((a&0xFCFCFCFCUL)>>2)\
 620           + ((b&0xFCFCFCFCUL)>>2);\
 621         l1=  (c&0x03030303UL)\
 622            + (d&0x03030303UL);\
 623         h1= ((c&0xFCFCFCFCUL)>>2)\
 624           + ((d&0xFCFCFCFCUL)>>2);\
 625         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 626         a= LD32(&src1[i*src_stride1+4]);\
 627         b= LD32(&src2[i*src_stride2+4]);\
 628         c= LD32(&src3[i*src_stride3+4]);\
 629         d= LD32(&src4[i*src_stride4+4]);\
 630         l0=  (a&0x03030303UL)\
 631            + (b&0x03030303UL)\
 632            + 0x02020202UL;\
 633         h0= ((a&0xFCFCFCFCUL)>>2)\
 634           + ((b&0xFCFCFCFCUL)>>2);\
 635         l1=  (c&0x03030303UL)\
 636            + (d&0x03030303UL);\
 637         h1= ((c&0xFCFCFCFCUL)>>2)\
 638           + ((d&0xFCFCFCFCUL)>>2);\
 639         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 640     }\
 641 }\
 642 \
 643 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 644     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 645 }\
 646 \
 647 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 648     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 649 }\
 650 \
 651 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 652     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 653 }\
 654 \
 655 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 656     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 657 }\
 658 \
 659 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 660                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 661     int i;\
 662     for(i=0; i<h; i++){\
 663         uint32_t a, b, c, d, l0, l1, h0, h1;\
 664         a= LD32(&src1[i*src_stride1]);\
 665         b= LD32(&src2[i*src_stride2]);\
 666         c= LD32(&src3[i*src_stride3]);\
 667         d= LD32(&src4[i*src_stride4]);\
 668         l0=  (a&0x03030303UL)\
 669            + (b&0x03030303UL)\
 670            + 0x01010101UL;\
 671         h0= ((a&0xFCFCFCFCUL)>>2)\
 672           + ((b&0xFCFCFCFCUL)>>2);\
 673         l1=  (c&0x03030303UL)\
 674            + (d&0x03030303UL);\
 675         h1= ((c&0xFCFCFCFCUL)>>2)\
 676           + ((d&0xFCFCFCFCUL)>>2);\
 677         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 678         a= LD32(&src1[i*src_stride1+4]);\
 679         b= LD32(&src2[i*src_stride2+4]);\
 680         c= LD32(&src3[i*src_stride3+4]);\
 681         d= LD32(&src4[i*src_stride4+4]);\
 682         l0=  (a&0x03030303UL)\
 683            + (b&0x03030303UL)\
 684            + 0x01010101UL;\
 685         h0= ((a&0xFCFCFCFCUL)>>2)\
 686           + ((b&0xFCFCFCFCUL)>>2);\
 687         l1=  (c&0x03030303UL)\
 688            + (d&0x03030303UL);\
 689         h1= ((c&0xFCFCFCFCUL)>>2)\
 690           + ((d&0xFCFCFCFCUL)>>2);\
 691         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 692     }\
 693 }\
 694 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 695                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 696     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 697     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 698 }\
 699 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 700                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 701     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 702     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 703 }\
 704 \
 705 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 706 {\
 707         int i, a0, b0, a1, b1;\
 708         a0= pixels[0];\
 709         b0= pixels[1] + 2;\
 710         a0 += b0;\
 711         b0 += pixels[2];\
 712 \
 713         pixels+=line_size;\
 714         for(i=0; i<h; i+=2){\
 715             a1= pixels[0];\
 716             b1= pixels[1];\
 717             a1 += b1;\
 718             b1 += pixels[2];\
 719 \
 720             block[0]= (a1+a0)>>2; /* FIXME non put */\
 721             block[1]= (b1+b0)>>2;\
 722 \
 723             pixels+=line_size;\
 724             block +=line_size;\
 725 \
 726             a0= pixels[0];\
 727             b0= pixels[1] + 2;\
 728             a0 += b0;\
 729             b0 += pixels[2];\
 730 \
 731             block[0]= (a1+a0)>>2;\
 732             block[1]= (b1+b0)>>2;\
 733             pixels+=line_size;\
 734             block +=line_size;\
 735         }\
 736 }\
 737 \
 738 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 739 {\
 740         int i;\
 741         const uint32_t a= LD32(pixels  );\
 742         const uint32_t b= LD32(pixels+1);\
 743         uint32_t l0=  (a&0x03030303UL)\
 744                     + (b&0x03030303UL)\
 745                     + 0x02020202UL;\
 746         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 747                    + ((b&0xFCFCFCFCUL)>>2);\
 748         uint32_t l1,h1;\
 749 \
 750         pixels+=line_size;\
 751         for(i=0; i<h; i+=2){\
 752             uint32_t a= LD32(pixels  );\
 753             uint32_t b= LD32(pixels+1);\
 754             l1=  (a&0x03030303UL)\
 755                + (b&0x03030303UL);\
 756             h1= ((a&0xFCFCFCFCUL)>>2)\
 757               + ((b&0xFCFCFCFCUL)>>2);\
 758             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 759             pixels+=line_size;\
 760             block +=line_size;\
 761             a= LD32(pixels  );\
 762             b= LD32(pixels+1);\
 763             l0=  (a&0x03030303UL)\
 764                + (b&0x03030303UL)\
 765                + 0x02020202UL;\
 766             h0= ((a&0xFCFCFCFCUL)>>2)\
 767               + ((b&0xFCFCFCFCUL)>>2);\
 768             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 769             pixels+=line_size;\
 770             block +=line_size;\
 771         }\
 772 }\
 773 \
 774 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 775 {\
 776     int j;\
 777     for(j=0; j<2; j++){\
 778         int i;\
 779         const uint32_t a= LD32(pixels  );\
 780         const uint32_t b= LD32(pixels+1);\
 781         uint32_t l0=  (a&0x03030303UL)\
 782                     + (b&0x03030303UL)\
 783                     + 0x02020202UL;\
 784         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 785                    + ((b&0xFCFCFCFCUL)>>2);\
 786         uint32_t l1,h1;\
 787 \
 788         pixels+=line_size;\
 789         for(i=0; i<h; i+=2){\
 790             uint32_t a= LD32(pixels  );\
 791             uint32_t b= LD32(pixels+1);\
 792             l1=  (a&0x03030303UL)\
 793                + (b&0x03030303UL);\
 794             h1= ((a&0xFCFCFCFCUL)>>2)\
 795               + ((b&0xFCFCFCFCUL)>>2);\
 796             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 797             pixels+=line_size;\
 798             block +=line_size;\
 799             a= LD32(pixels  );\
 800             b= LD32(pixels+1);\
 801             l0=  (a&0x03030303UL)\
 802                + (b&0x03030303UL)\
 803                + 0x02020202UL;\
 804             h0= ((a&0xFCFCFCFCUL)>>2)\
 805               + ((b&0xFCFCFCFCUL)>>2);\
 806             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 807             pixels+=line_size;\
 808             block +=line_size;\
 809         }\
 810         pixels+=4-line_size*(h+1);\
 811         block +=4-line_size*h;\
 812     }\
 813 }\
 814 \
 815 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 816 {\
 817     int j;\
 818     for(j=0; j<2; j++){\
 819         int i;\
 820         const uint32_t a= LD32(pixels  );\
 821         const uint32_t b= LD32(pixels+1);\
 822         uint32_t l0=  (a&0x03030303UL)\
 823                     + (b&0x03030303UL)\
 824                     + 0x01010101UL;\
 825         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 826                    + ((b&0xFCFCFCFCUL)>>2);\
 827         uint32_t l1,h1;\
 828 \
 829         pixels+=line_size;\
 830         for(i=0; i<h; i+=2){\
 831             uint32_t a= LD32(pixels  );\
 832             uint32_t b= LD32(pixels+1);\
 833             l1=  (a&0x03030303UL)\
 834                + (b&0x03030303UL);\
 835             h1= ((a&0xFCFCFCFCUL)>>2)\
 836               + ((b&0xFCFCFCFCUL)>>2);\
 837             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 838             pixels+=line_size;\
 839             block +=line_size;\
 840             a= LD32(pixels  );\
 841             b= LD32(pixels+1);\
 842             l0=  (a&0x03030303UL)\
 843                + (b&0x03030303UL)\
 844                + 0x01010101UL;\
 845             h0= ((a&0xFCFCFCFCUL)>>2)\
 846               + ((b&0xFCFCFCFCUL)>>2);\
 847             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 848             pixels+=line_size;\
 849             block +=line_size;\
 850         }\
 851         pixels+=4-line_size*(h+1);\
 852         block +=4-line_size*h;\
 853     }\
 854 }\
 855 \
 856 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
 857 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
 858 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
 859 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
 860 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
 861 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
 862 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
 863 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
 864
 865 #define op_avg(a, b) a = rnd_avg32(a, b)
 866 #endif
 867 #define op_put(a, b) a = b
 868
 869 PIXOP2(avg, op_avg)
 870 PIXOP2(put, op_put)
 871 #undef op_avg
 872 #undef op_put
 873
 874 #define avg2(a,b) ((a+b+1)>>1)
 875 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 876
 877
 878 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 879 {
 880     const int A=(16-x16)*(16-y16);
 881     const int B=(   x16)*(16-y16);
 882     const int C=(16-x16)*(   y16);
 883     const int D=(   x16)*(   y16);
 884     int i;
 885
 886     for(i=0; i<h; i++)
 887     {
 888         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 889         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 890         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 891         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 892         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 893         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 894         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 895         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 896         dst+= stride;
 897         src+= stride;
 898     }
 899 }
 900
 901 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 902                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 903 {
 904     int y, vx, vy;
 905     const int s= 1<<shift;
 906
 907     width--;
 908     height--;
 909
 910     for(y=0; y<h; y++){
 911         int x;
 912
 913         vx= ox;
 914         vy= oy;
 915         for(x=0; x<8; x++){ //XXX FIXME optimize
 916             int src_x, src_y, frac_x, frac_y, index;
 917
 918             src_x= vx>>16;
 919             src_y= vy>>16;
 920             frac_x= src_x&(s-1);
 921             frac_y= src_y&(s-1);
 922             src_x>>=shift;
 923             src_y>>=shift;
 924
 925             if((unsigned)src_x < width){
 926                 if((unsigned)src_y < height){
 927                     index= src_x + src_y*stride;
 928                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 929                                            + src[index       +1]*   frac_x )*(s-frac_y)
 930                                         + (  src[index+stride  ]*(s-frac_x)
 931                                            + src[index+stride+1]*   frac_x )*   frac_y
 932                                         + r)>>(shift*2);
 933                 }else{
 934                     index= src_x + clip(src_y, 0, height)*stride;
 935                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 936                                           + src[index       +1]*   frac_x )*s
 937                                         + r)>>(shift*2);
 938                 }
 939             }else{
 940                 if((unsigned)src_y < height){
 941                     index= clip(src_x, 0, width) + src_y*stride;
 942                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 943                                            + src[index+stride  ]*   frac_y )*s
 944                                         + r)>>(shift*2);
 945                 }else{
 946                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
 947                     dst[y*stride + x]=    src[index         ];
 948                 }
 949             }
 950
 951             vx+= dxx;
 952             vy+= dyx;
 953         }
 954         ox += dxy;
 955         oy += dyy;
 956     }
 957 }
 958
 959 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 960     switch(width){
 961     case 2: put_pixels2_c (dst, src, stride, height); break;
 962     case 4: put_pixels4_c (dst, src, stride, height); break;
 963     case 8: put_pixels8_c (dst, src, stride, height); break;
 964     case 16:put_pixels16_c(dst, src, stride, height); break;
 965     }
 966 }
 967
 968 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 969     int i,j;
 970     for (i=0; i < height; i++) {
 971       for (j=0; j < width; j++) {
 972         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 973       }
 974       src += stride;
 975       dst += stride;
 976     }
 977 }
 978
 979 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 980     int i,j;
 981     for (i=0; i < height; i++) {
 982       for (j=0; j < width; j++) {
 983         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 984       }
 985       src += stride;
 986       dst += stride;
 987     }
 988 }
 989
 990 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 991     int i,j;
 992     for (i=0; i < height; i++) {
 993       for (j=0; j < width; j++) {
 994         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 995       }
 996       src += stride;
 997       dst += stride;
 998     }
 999 }
1000
/**
 * Two-dimensional blend of the 2x2 neighbourhood with weights
 * 4 (current), 3 (right), 3 (below), 2 (below-right); the weights sum to 12
 * and the result is scaled by 2731/32768 (approximately 1/12) after adding 6.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1011
/**
 * Two-dimensional blend of the 2x2 neighbourhood with weights
 * 3 (current), 2 (right), 4 (below), 3 (below-right); the weights sum to 12
 * and the result is scaled by 2731/32768 (approximately 1/12) after adding 6.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1022
/**
 * Vertical blend weighted 1:2 towards the row below:
 * (cur + 2*below + 1) scaled by 683/2048 (approximately 1/3).
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            dst[col] = (683*(src[col] + 2*below[col] + 1)) >> 11;
        }
        src  = below;
        dst += stride;
    }
}
1033
/**
 * Two-dimensional blend of the 2x2 neighbourhood with weights
 * 3 (current), 4 (right), 2 (below), 3 (below-right); the weights sum to 12
 * and the result is scaled by 2731/32768 (approximately 1/12) after adding 6.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1044
/**
 * Two-dimensional blend of the 2x2 neighbourhood with weights
 * 2 (current), 3 (right), 3 (below), 4 (below-right); the weights sum to 12
 * and the result is scaled by 2731/32768 (approximately 1/12) after adding 6.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1055
/**
 * Offset (0,0): no interpolation, just forward to the averaging routine
 * matching the block width.  Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        avg_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        avg_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        avg_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        avg_pixels16_c(dst, src, stride, height);
    }
}
1064
/**
 * Averaging variant of the 2:1 horizontal blend: the interpolated value
 * (683*(2*cur + right + 1)) >> 11 is averaged, rounding up, with the sample
 * already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1075
/**
 * Averaging variant of the 1:2 horizontal blend: the interpolated value
 * (683*(cur + 2*right + 1)) >> 11 is averaged, rounding up, with the sample
 * already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1086
/**
 * Averaging variant of the 2:1 vertical blend: the interpolated value
 * (683*(2*cur + below + 1)) >> 11 is averaged, rounding up, with the sample
 * already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1097
/**
 * Averaging variant of the 4/3/3/2 two-dimensional blend (current, right,
 * below, below-right; scaled by 2731/32768 after adding 6).  The interpolated
 * value is averaged, rounding up, with the sample already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1108
/**
 * Averaging variant of the 3/2/4/3 two-dimensional blend (current, right,
 * below, below-right; scaled by 2731/32768 after adding 6).  The interpolated
 * value is averaged, rounding up, with the sample already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1119
/**
 * Averaging variant of the 1:2 vertical blend: the interpolated value
 * (683*(cur + 2*below + 1)) >> 11 is averaged, rounding up, with the sample
 * already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1130
/**
 * Averaging variant of the 3/4/2/3 two-dimensional blend (current, right,
 * below, below-right; scaled by 2731/32768 after adding 6).  The interpolated
 * value is averaged, rounding up, with the sample already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1141
/**
 * Averaging variant of the 2/3/3/4 two-dimensional blend (current, right,
 * below, below-right; scaled by 2731/32768 after adding 6).  The interpolated
 * value is averaged, rounding up, with the sample already stored in dst.
 * Processes a width x height block; dst and src advance by stride per row.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
#if 0
/* Disabled code: TPEL_WIDTH(width) would generate fixed-width wrappers
   put_tpel_pixels<width>_mcXY_c() that forward to the generic
   put_tpel_pixels_mcXY_c() helpers with the width filled in.
   The forwarding statements are plain calls; a leading "void" in front of
   them would make each one an invalid declaration once this is enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1173
/*
 * H264_CHROMA_MC(OPNAME, OP) generates OPNAME##h264_chroma_mc{2,4,8}_c.
 * Each function bilinearly blends a 2-, 4- or 8-sample wide block of h rows
 * from src into dst; x and y (asserted to lie in 0..7) pick the weights of
 * the current sample, its right neighbour and the two samples one row below.
 * The four weights sum to 64.  OP(lvalue, sum) receives the unnormalized
 * weighted sum and performs the rounding, shift and store.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<2; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<4; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<8; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/* b is the weighted sum scaled by 64: add 32 and shift by 6 to round it;
   op_avg additionally averages (rounding up) with the value already in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1244
/**
 * Copy h rows of one 4-byte unit each from src to dst using the LD32/ST32
 * helpers (defined outside this chunk; presumably a 32-bit load and store).
 * Consecutive rows are dstStride / srcStride bytes apart.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        ST32(dst, LD32(src));
    }
}
1255
/**
 * Copy h rows of 8 bytes each from src to dst, as two 4-byte units per row
 * moved with the LD32/ST32 helpers (defined outside this chunk; presumably a
 * 32-bit load and store).
 * Consecutive rows are dstStride / srcStride bytes apart.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;

        for(off=0; off<8; off+=4){
            ST32(dst+off, LD32(src+off));
        }
    }
}
1267
/**
 * Copy h rows of 16 bytes each from src to dst, as four 4-byte units per row
 * moved with the LD32/ST32 helpers (defined outside this chunk; presumably a
 * 32-bit load and store).
 * Consecutive rows are dstStride / srcStride bytes apart.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;

        for(off=0; off<16; off+=4){
            ST32(dst+off, LD32(src+off));
        }
    }
}
1281
/**
 * Copy h rows of 17 bytes each from src to dst: four 4-byte units moved with
 * the LD32/ST32 helpers (defined outside this chunk; presumably a 32-bit load
 * and store) followed by one trailing byte at offset 16.
 * Consecutive rows are dstStride / srcStride bytes apart.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;

        for(off=0; off<16; off+=4){
            ST32(dst+off, LD32(src+off));
        }
        dst[16]= src[16];
    }
}
1296
/**
 * Copy h rows of 9 bytes each from src to dst: two 4-byte units moved with
 * the LD32/ST32 helpers (defined outside this chunk; presumably a 32-bit load
 * and store) followed by one trailing byte at offset 8.
 * Consecutive rows are dstStride / srcStride bytes apart.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;

        for(off=0; off<8; off+=4){
            ST32(dst+off, LD32(src+off));
        }
        dst[8]= src[8];
    }
}
1309
1310
1311 #define QPEL_MC(r, OPNAME, RND, OP) \
1312 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1313     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1314     int i;\
1315     for(i=0; i<h; i++)\
1316     {\
1317         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1318         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1319         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1320         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1321         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1322         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1323         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1324         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1325         dst+=dstStride;\
1326         src+=srcStride;\
1327     }\
1328 }\
1329 \
1330 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1331     const int w=8;\
1332     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1333     int i;\
1334     for(i=0; i<w; i++)\
1335     {\
1336         const int src0= src[0*srcStride];\
1337         const int src1= src[1*srcStride];\
1338         const int src2= src[2*srcStride];\
1339         const int src3= src[3*srcStride];\
1340         const int src4= src[4*srcStride];\
1341         const int src5= src[5*srcStride];\
1342         const int src6= src[6*srcStride];\
1343         const int src7= src[7*srcStride];\
1344         const int src8= src[8*srcStride];\
1345         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1346         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1347         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1348         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1349         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1350         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1351         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1352         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1353         dst++;\
1354         src++;\
1355     }\
1356 }\
1357 \
1358 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1359     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1360     int i;\
1361     \
1362     for(i=0; i<h; i++)\
1363     {\
1364         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1365         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1366         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1367         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1368         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1369         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1370         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1371         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1372         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1373         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1374         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1375         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1376         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1377         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1378         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1379         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1380         dst+=dstStride;\
1381         src+=srcStride;\
1382     }\
1383 }\
1384 \
1385 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1386     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1387     int i;\
1388     const int w=16;\
1389     for(i=0; i<w; i++)\
1390     {\
1391         const int src0= src[0*srcStride];\
1392         const int src1= src[1*srcStride];\
1393         const int src2= src[2*srcStride];\
1394         const int src3= src[3*srcStride];\
1395         const int src4= src[4*srcStride];\
1396         const int src5= src[5*srcStride];\
1397         const int src6= src[6*srcStride];\
1398         const int src7= src[7*srcStride];\
1399         const int src8= src[8*srcStride];\
1400         const int src9= src[9*srcStride];\
1401         const int src10= src[10*srcStride];\
1402         const int src11= src[11*srcStride];\
1403         const int src12= src[12*srcStride];\
1404         const int src13= src[13*srcStride];\
1405         const int src14= src[14*srcStride];\
1406         const int src15= src[15*srcStride];\
1407         const int src16= src[16*srcStride];\
1408         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1409         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1410         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1411         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1412         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1413         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1414         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1415         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1416         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1417         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1418         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1419         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1420         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1421         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1422         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1423         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1424         dst++;\
1425         src++;\
1426     }\
1427 }\
1428 \
1429 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1430     OPNAME ## pixels8_c(dst, src, stride, 8);\
1431 }\
1432 \
1433 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1434     uint8_t half[64];\
1435     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1436     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1437 }\
1438 \
1439 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1440     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1441 }\
1442 \
1443 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1444     uint8_t half[64];\
1445     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1446     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1447 }\
1448 \
1449 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1450     uint8_t full[16*9];\
1451     uint8_t half[64];\
1452     copy_block9(full, src, 16, stride, 9);\
1453     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1455 }\
1456 \
1457 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1458     uint8_t full[16*9];\
1459     copy_block9(full, src, 16, stride, 9);\
1460     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1461 }\
1462 \
1463 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1464     uint8_t full[16*9];\
1465     uint8_t half[64];\
1466     copy_block9(full, src, 16, stride, 9);\
1467     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1468     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1469 }\
1470 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1471     uint8_t full[16*9];\
1472     uint8_t halfH[72];\
1473     uint8_t halfV[64];\
1474     uint8_t halfHV[64];\
1475     copy_block9(full, src, 16, stride, 9);\
1476     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1477     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1478     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1479     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1480 }\
1481 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1482     uint8_t full[16*9];\
1483     uint8_t halfH[72];\
1484     uint8_t halfHV[64];\
1485     copy_block9(full, src, 16, stride, 9);\
1486     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1487     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1488     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1489     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1490 }\
1491 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1492     uint8_t full[16*9];\
1493     uint8_t halfH[72];\
1494     uint8_t halfV[64];\
1495     uint8_t halfHV[64];\
1496     copy_block9(full, src, 16, stride, 9);\
1497     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1498     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1499     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1500     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1501 }\
1502 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1503     uint8_t full[16*9];\
1504     uint8_t halfH[72];\
1505     uint8_t halfHV[64];\
1506     copy_block9(full, src, 16, stride, 9);\
1507     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1508     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1509     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1510     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1511 }\
1512 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1513     uint8_t full[16*9];\
1514     uint8_t halfH[72];\
1515     uint8_t halfV[64];\
1516     uint8_t halfHV[64];\
1517     copy_block9(full, src, 16, stride, 9);\
1518     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1519     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1520     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1521     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1522 }\
1523 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1524     uint8_t full[16*9];\
1525     uint8_t halfH[72];\
1526     uint8_t halfHV[64];\
1527     copy_block9(full, src, 16, stride, 9);\
1528     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1529     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1530     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1531     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1532 }\
1533 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1534     uint8_t full[16*9];\
1535     uint8_t halfH[72];\
1536     uint8_t halfV[64];\
1537     uint8_t halfHV[64];\
1538     copy_block9(full, src, 16, stride, 9);\
1539     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1540     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1541     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1542     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1543 }\
1544 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1545     uint8_t full[16*9];\
1546     uint8_t halfH[72];\
1547     uint8_t halfHV[64];\
1548     copy_block9(full, src, 16, stride, 9);\
1549     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1550     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1551     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1553 }\
1554 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1555     uint8_t halfH[72];\
1556     uint8_t halfHV[64];\
1557     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1558     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1560 }\
1561 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1562     uint8_t halfH[72];\
1563     uint8_t halfHV[64];\
1564     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1565     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1566     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1567 }\
1568 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1569     uint8_t full[16*9];\
1570     uint8_t halfH[72];\
1571     uint8_t halfV[64];\
1572     uint8_t halfHV[64];\
1573     copy_block9(full, src, 16, stride, 9);\
1574     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1575     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1576     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1577     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1578 }\
1579 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1580     uint8_t full[16*9];\
1581     uint8_t halfH[72];\
1582     copy_block9(full, src, 16, stride, 9);\
1583     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1584     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1585     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1586 }\
1587 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1588     uint8_t full[16*9];\
1589     uint8_t halfH[72];\
1590     uint8_t halfV[64];\
1591     uint8_t halfHV[64];\
1592     copy_block9(full, src, 16, stride, 9);\
1593     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1594     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1595     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1596     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1597 }\
1598 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1599     uint8_t full[16*9];\
1600     uint8_t halfH[72];\
1601     copy_block9(full, src, 16, stride, 9);\
1602     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1604     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1605 }\
1606 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1607     uint8_t halfH[72];\
1608     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1609     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1610 }\
1611 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1612     OPNAME ## pixels16_c(dst, src, stride, 16);\
1613 }\
1614 \
1615 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1616     uint8_t half[256];\
1617     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1618     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1619 }\
1620 \
1621 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1622     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1623 }\
1624 \
1625 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1626     uint8_t half[256];\
1627     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1628     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1629 }\
1630 \
1631 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1632     uint8_t full[24*17];\
1633     uint8_t half[256];\
1634     copy_block17(full, src, 24, stride, 17);\
1635     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1637 }\
1638 \
1639 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1640     uint8_t full[24*17];\
1641     copy_block17(full, src, 24, stride, 17);\
1642     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1643 }\
1644 \
1645 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1646     uint8_t full[24*17];\
1647     uint8_t half[256];\
1648     copy_block17(full, src, 24, stride, 17);\
1649     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1650     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1651 }\
1652 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1653     uint8_t full[24*17];\
1654     uint8_t halfH[272];\
1655     uint8_t halfV[256];\
1656     uint8_t halfHV[256];\
1657     copy_block17(full, src, 24, stride, 17);\
1658     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1659     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1660     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1661     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1662 }\
1663 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1664     uint8_t full[24*17];\
1665     uint8_t halfH[272];\
1666     uint8_t halfHV[256];\
1667     copy_block17(full, src, 24, stride, 17);\
1668     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1669     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1670     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1671     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1672 }\
1673 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1674     uint8_t full[24*17];\
1675     uint8_t halfH[272];\
1676     uint8_t halfV[256];\
1677     uint8_t halfHV[256];\
1678     copy_block17(full, src, 24, stride, 17);\
1679     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1680     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1681     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1682     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1683 }\
1684 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1685     uint8_t full[24*17];\
1686     uint8_t halfH[272];\
1687     uint8_t halfHV[256];\
1688     copy_block17(full, src, 24, stride, 17);\
1689     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1690     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1691     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1692     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1693 }\
1694 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1695     uint8_t full[24*17];\
1696     uint8_t halfH[272];\
1697     uint8_t halfV[256];\
1698     uint8_t halfHV[256];\
1699     copy_block17(full, src, 24, stride, 17);\
1700     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1701     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1702     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1703     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1704 }\
1705 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1706     uint8_t full[24*17];\
1707     uint8_t halfH[272];\
1708     uint8_t halfHV[256];\
1709     copy_block17(full, src, 24, stride, 17);\
1710     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1711     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1712     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1713     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1714 }\
1715 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1716     uint8_t full[24*17];\
1717     uint8_t halfH[272];\
1718     uint8_t halfV[256];\
1719     uint8_t halfHV[256];\
1720     copy_block17(full, src, 24, stride, 17);\
1721     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1722     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1723     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1724     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1725 }\
1726 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1727     uint8_t full[24*17];\
1728     uint8_t halfH[272];\
1729     uint8_t halfHV[256];\
1730     copy_block17(full, src, 24, stride, 17);\
1731     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1732     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1733     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1735 }\
1736 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1737     uint8_t halfH[272];\
1738     uint8_t halfHV[256];\
1739     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1740     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1742 }\
1743 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1744     uint8_t halfH[272];\
1745     uint8_t halfHV[256];\
1746     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1747     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1748     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1749 }\
1750 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1751     uint8_t full[24*17];\
1752     uint8_t halfH[272];\
1753     uint8_t halfV[256];\
1754     uint8_t halfHV[256];\
1755     copy_block17(full, src, 24, stride, 17);\
1756     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1757     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1758     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1759     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1760 }\
1761 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1762     uint8_t full[24*17];\
1763     uint8_t halfH[272];\
1764     copy_block17(full, src, 24, stride, 17);\
1765     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1766     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1767     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1768 }\
1769 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770     uint8_t full[24*17];\
1771     uint8_t halfH[272];\
1772     uint8_t halfV[256];\
1773     uint8_t halfHV[256];\
1774     copy_block17(full, src, 24, stride, 17);\
1775     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1776     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1777     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1778     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1779 }\
1780 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1781     uint8_t full[24*17];\
1782     uint8_t halfH[272];\
1783     copy_block17(full, src, 24, stride, 17);\
1784     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1786     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1787 }\
1788 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1789     uint8_t halfH[272];\
1790     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1791     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1792 }
1793
/*
 * Store operators passed to QPEL_MC as its OP argument.
 *
 * Each takes an lvalue `a` and an unnormalised sum `b`. `b` is biased,
 * shifted right by 5 and looked up in `cm`, which is not a parameter: a
 * variable of that name must be in scope wherever the macro expands
 * (presumably the clipping table pointer, as in the H.264 helpers further
 * down - confirm against the mpeg4 lowpass functions).
 *
 *   op_put         stores the looked-up value, bias 16
 *   op_put_no_rnd  same with bias 15
 *   op_avg         stores (a + value + 1) >> 1, bias 16; `a` is evaluated twice
 *   op_avg_no_rnd  stores (a + value) >> 1, bias 15; `a` is evaluated twice
 */
1794 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1795 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1796 #define op_put(a, b) a = cm[((b) + 16)>>5]
1797 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1798
/* Instantiate the QPEL_MC function family for the put_, put_no_rnd_ and avg_
   prefixes. */
1799 QPEL_MC(0, put_       , _       , op_put)
1800 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1801 QPEL_MC(0, avg_       , _       , op_avg)
/* The avg_no_rnd instantiation is disabled; as written it would also pass
   op_avg rather than op_avg_no_rnd, so op_avg_no_rnd has no active user here. */
1802 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* Drop the operator names; op_avg and op_put are defined again for the H.264
   templates later in the file. */
1803 #undef op_avg
1804 #undef op_avg_no_rnd
1805 #undef op_put
1806 #undef op_put_no_rnd
1807
1808 #if 1
/**
 * Template for the h264 6-tap lowpass (interpolation) helpers.
 *
 * Expands to static functions named
 * OPNAME##h264_qpel{4,8,16}_{h,v,hv}_lowpass. Every output sample is built
 * from the tap pattern (1, -5, 20, 20, -5, 1) applied to six neighbouring
 * input samples, so in the filtered direction the functions read two samples
 * before the first output position and three samples after the last one.
 *
 * @param OPNAME prefix pasted onto every generated function name
 * @param OP     statement macro OP(lvalue, sum) that stores the result of a
 *               single horizontal or vertical pass; sum is not normalised
 * @param OP2    statement macro OP2(lvalue, sum) that stores the result of
 *               the two-pass (hv) filter; sum is not normalised
 *
 * The local `cm` declared in each function is never used directly by the
 * function body; it is there for OP/OP2, whose definitions (the op_* macros
 * after H264_MC) index it by name.
 */
1809 #define H264_LOWPASS(OPNAME, OP, OP2) \
    /* horizontal filter, 4 rows of 4 samples; reads src[-2]..src[6] per row */\
1810 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1811     const int h=4;\
1812     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1813     int i;\
1814     for(i=0; i<h; i++)\
1815     {\
1816         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1817         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1818         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1819         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1820         dst+=dstStride;\
1821         src+=srcStride;\
1822     }\
1823 }\
1824 \
    /* vertical filter, 4 columns of 4 samples; reads rows -2..6 of src */\
1825 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1826     const int w=4;\
1827     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1828     int i;\
1829     for(i=0; i<w; i++)\
1830     {\
        /* one column per iteration; every input is loaded before any store */\
1831         const int srcB= src[-2*srcStride];\
1832         const int srcA= src[-1*srcStride];\
1833         const int src0= src[0 *srcStride];\
1834         const int src1= src[1 *srcStride];\
1835         const int src2= src[2 *srcStride];\
1836         const int src3= src[3 *srcStride];\
1837         const int src4= src[4 *srcStride];\
1838         const int src5= src[5 *srcStride];\
1839         const int src6= src[6 *srcStride];\
1840         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1841         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1842         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1843         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1844         dst++;\
1845         src++;\
1846     }\
1847 }\
1848 \
    /* two-pass filter for a 4x4 block; tmp must hold h+5 rows of tmpStride */\
1849 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1850     const int h=4;\
1851     const int w=4;\
1852     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1853     int i;\
    /* begin two rows above the block: the vertical pass needs rows -2..h+2 */\
1854     src -= 2*srcStride;\
    /* pass 1: horizontal filter of h+5 rows, stored unnormalised in tmp */\
1855     for(i=0; i<h+5; i++)\
1856     {\
1857         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1858         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1859         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1860         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1861         tmp+=tmpStride;\
1862         src+=srcStride;\
1863     }\
    /* rewind tmp to its third row, the one matching the first output row */\
1864     tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter down each column of tmp, stored through OP2 */\
1865     for(i=0; i<w; i++)\
1866     {\
1867         const int tmpB= tmp[-2*tmpStride];\
1868         const int tmpA= tmp[-1*tmpStride];\
1869         const int tmp0= tmp[0 *tmpStride];\
1870         const int tmp1= tmp[1 *tmpStride];\
1871         const int tmp2= tmp[2 *tmpStride];\
1872         const int tmp3= tmp[3 *tmpStride];\
1873         const int tmp4= tmp[4 *tmpStride];\
1874         const int tmp5= tmp[5 *tmpStride];\
1875         const int tmp6= tmp[6 *tmpStride];\
1876         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1877         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1878         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1879         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1880         dst++;\
1881         tmp++;\
1882     }\
1883 }\
1884 \
    /* horizontal filter, 8 rows of 8 samples; reads src[-2]..src[10] per row */\
1885 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1886     const int h=8;\
1887     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1888     int i;\
1889     for(i=0; i<h; i++)\
1890     {\
1891         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1892         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1893         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1894         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1895         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1896         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1897         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1898         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1899         dst+=dstStride;\
1900         src+=srcStride;\
1901     }\
1902 }\
1903 \
    /* vertical filter, 8 columns of 8 samples; reads rows -2..10 of src */\
1904 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1905     const int w=8;\
1906     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1907     int i;\
1908     for(i=0; i<w; i++)\
1909     {\
        /* one column per iteration; every input is loaded before any store */\
1910         const int srcB= src[-2*srcStride];\
1911         const int srcA= src[-1*srcStride];\
1912         const int src0= src[0 *srcStride];\
1913         const int src1= src[1 *srcStride];\
1914         const int src2= src[2 *srcStride];\
1915         const int src3= src[3 *srcStride];\
1916         const int src4= src[4 *srcStride];\
1917         const int src5= src[5 *srcStride];\
1918         const int src6= src[6 *srcStride];\
1919         const int src7= src[7 *srcStride];\
1920         const int src8= src[8 *srcStride];\
1921         const int src9= src[9 *srcStride];\
1922         const int src10=src[10*srcStride];\
1923         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1924         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1925         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1926         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1927         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1928         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1929         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1930         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1931         dst++;\
1932         src++;\
1933     }\
1934 }\
1935 \
    /* two-pass filter for an 8x8 block; tmp must hold h+5 rows of tmpStride */\
1936 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1937     const int h=8;\
1938     const int w=8;\
1939     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1940     int i;\
    /* begin two rows above the block: the vertical pass needs rows -2..h+2 */\
1941     src -= 2*srcStride;\
    /* pass 1: horizontal filter of h+5 rows, stored unnormalised in tmp */\
1942     for(i=0; i<h+5; i++)\
1943     {\
1944         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1945         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1946         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1947         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1948         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1949         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1950         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1951         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1952         tmp+=tmpStride;\
1953         src+=srcStride;\
1954     }\
    /* rewind tmp to its third row, the one matching the first output row */\
1955     tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter down each column of tmp, stored through OP2 */\
1956     for(i=0; i<w; i++)\
1957     {\
1958         const int tmpB= tmp[-2*tmpStride];\
1959         const int tmpA= tmp[-1*tmpStride];\
1960         const int tmp0= tmp[0 *tmpStride];\
1961         const int tmp1= tmp[1 *tmpStride];\
1962         const int tmp2= tmp[2 *tmpStride];\
1963         const int tmp3= tmp[3 *tmpStride];\
1964         const int tmp4= tmp[4 *tmpStride];\
1965         const int tmp5= tmp[5 *tmpStride];\
1966         const int tmp6= tmp[6 *tmpStride];\
1967         const int tmp7= tmp[7 *tmpStride];\
1968         const int tmp8= tmp[8 *tmpStride];\
1969         const int tmp9= tmp[9 *tmpStride];\
1970         const int tmp10=tmp[10*tmpStride];\
1971         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1972         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1973         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1974         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1975         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1976         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1977         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1978         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1979         dst++;\
1980         tmp++;\
1981     }\
1982 }\
1983 \
    /* 16x16 vertical filter: left and right 8x8 of the top half, then of the bottom half */\
1984 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1985     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1986     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1987     src += 8*srcStride;\
1988     dst += 8*dstStride;\
1989     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1990     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1991 }\
1992 \
    /* 16x16 horizontal filter: left and right 8x8 of the top half, then of the bottom half */\
1993 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1994     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1995     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1996     src += 8*srcStride;\
1997     dst += 8*dstStride;\
1998     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1999     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2000 }\
2001 \
    /* 16x16 two-pass filter from four 8x8 calls. Each call fills 8+5 rows of */\
    /* tmp and the bottom half starts 8 rows further down, so tmp must hold */\
    /* 16+5 rows of tmpStride; the bottom calls overwrite rows the top calls used. */\
2002 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2003     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2004     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2005     src += 8*srcStride;\
2006     tmp += 8*tmpStride;\
2007     dst += 8*dstStride;\
2008     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2009     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2010 }\
2011
/**
 * Template for the 16 quarter-pel motion compensation entry points of one
 * block size.
 *
 * Expands to static functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c(dst, src, stride) for X and Y in 0..3.
 * Judging by the bodies, X selects the horizontal and Y the vertical
 * quarter-sample offset: mc00 is the plain block operation, mc20 the pure
 * horizontal lowpass, mc02 the pure vertical one and mc22 the two-pass one.
 * Every other position builds two planes out of {source samples, halfH,
 * halfV, halfHV} and merges them with OPNAME##pixels##SIZE##_l2, which is
 * defined outside this section (presumably a rounded average of its two
 * inputs - confirm at its definition).
 *
 * Intermediate planes are always produced with the put_ lowpass variants;
 * OPNAME only decides how the final result reaches dst.
 *
 * @param OPNAME prefix of the generated functions and of the final store
 * @param SIZE   block width and height in samples; copy_block##SIZE,
 *               OPNAME##pixels##SIZE##_c, OPNAME##pixels##SIZE##_l2 and the
 *               h264_qpel##SIZE lowpass helpers must exist for it
 */
2012 #define H264_MC(OPNAME, SIZE) \
2013 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2014     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2015 }\
2016 \
2017 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t half[SIZE*SIZE];\
2019     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    /* merge the horizontal half-sample plane with the samples at src */\
2020     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2021 }\
2022 \
2023 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2024     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2025 }\
2026 \
2027 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2028     uint8_t half[SIZE*SIZE];\
2029     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    /* as mc10, but merged with the samples one column to the right */\
2030     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2031 }\
2032 \
2033 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    /* full: SIZE+5 rows copied with stride SIZE, starting two rows above src; */\
    /* full_mid: the row inside full that corresponds to src itself */\
2034     uint8_t full[SIZE*(SIZE+5)];\
2035     uint8_t * const full_mid= full + SIZE*2;\
2036     uint8_t half[SIZE*SIZE];\
2037     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2038     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2039     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2040 }\
2041 \
2042 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2043     uint8_t full[SIZE*(SIZE+5)];\
2044     uint8_t * const full_mid= full + SIZE*2;\
2045     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2046     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2047 }\
2048 \
2049 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2050     uint8_t full[SIZE*(SIZE+5)];\
2051     uint8_t * const full_mid= full + SIZE*2;\
2052     uint8_t half[SIZE*SIZE];\
2053     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2054     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    /* as mc01, but merged with the samples one row further down */\
2055     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2056 }\
2057 \
    /* mc11/mc31/mc13/mc33: merge a horizontal and a vertical half-sample plane. */\
    /* They differ only in whether halfH is taken one row down (src + stride) */\
    /* and whether halfV is taken one column to the right (+ 1 in the copy). */\
2058 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2059     uint8_t full[SIZE*(SIZE+5)];\
2060     uint8_t * const full_mid= full + SIZE*2;\
2061     uint8_t halfH[SIZE*SIZE];\
2062     uint8_t halfV[SIZE*SIZE];\
2063     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2064     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2065     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2066     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2067 }\
2068 \
2069 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2070     uint8_t full[SIZE*(SIZE+5)];\
2071     uint8_t * const full_mid= full + SIZE*2;\
2072     uint8_t halfH[SIZE*SIZE];\
2073     uint8_t halfV[SIZE*SIZE];\
2074     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2075     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2076     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2077     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2078 }\
2079 \
2080 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2081     uint8_t full[SIZE*(SIZE+5)];\
2082     uint8_t * const full_mid= full + SIZE*2;\
2083     uint8_t halfH[SIZE*SIZE];\
2084     uint8_t halfV[SIZE*SIZE];\
2085     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2086     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2087     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2088     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2089 }\
2090 \
2091 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2092     uint8_t full[SIZE*(SIZE+5)];\
2093     uint8_t * const full_mid= full + SIZE*2;\
2094     uint8_t halfH[SIZE*SIZE];\
2095     uint8_t halfV[SIZE*SIZE];\
2096     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2097     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2098     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2099     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2100 }\
2101 \
2102 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    /* scratch for the first filter pass: SIZE+5 rows of SIZE int16_t values */\
2103     int16_t tmp[SIZE*(SIZE+5)];\
2104     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2105 }\
2106 \
    /* mc21/mc23: merge the two-pass plane with a horizontal half-sample plane */\
    /* taken at src (mc21) or one row down (mc23) */\
2107 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2108     int16_t tmp[SIZE*(SIZE+5)];\
2109     uint8_t halfH[SIZE*SIZE];\
2110     uint8_t halfHV[SIZE*SIZE];\
2111     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2112     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2113     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2114 }\
2115 \
2116 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2117     int16_t tmp[SIZE*(SIZE+5)];\
2118     uint8_t halfH[SIZE*SIZE];\
2119     uint8_t halfHV[SIZE*SIZE];\
2120     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2121     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2122     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2123 }\
2124 \
    /* mc12/mc32: merge the two-pass plane with a vertical half-sample plane */\
    /* taken at src (mc12) or one column to the right (mc32) */\
2125 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2126     uint8_t full[SIZE*(SIZE+5)];\
2127     uint8_t * const full_mid= full + SIZE*2;\
2128     int16_t tmp[SIZE*(SIZE+5)];\
2129     uint8_t halfV[SIZE*SIZE];\
2130     uint8_t halfHV[SIZE*SIZE];\
2131     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2132     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2133     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2134     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2135 }\
2136 \
2137 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2138     uint8_t full[SIZE*(SIZE+5)];\
2139     uint8_t * const full_mid= full + SIZE*2;\
2140     int16_t tmp[SIZE*(SIZE+5)];\
2141     uint8_t halfV[SIZE*SIZE];\
2142     uint8_t halfHV[SIZE*SIZE];\
2143     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2144     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2145     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2146     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2147 }\
2148
2149 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2150 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2151 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2152 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2153 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2154
2155 H264_LOWPASS(put_       , op_put, op2_put)
2156 H264_LOWPASS(avg_       , op_avg, op2_avg)
2157 H264_MC(put_, 4)
2158 H264_MC(put_, 8)
2159 H264_MC(put_, 16)
2160 H264_MC(avg_, 4)
2161 H264_MC(avg_, 8)
2162 H264_MC(avg_, 16)
2163
2164 #undef op_avg
2165 #undef op_put
2166 #undef op2_avg
2167 #undef op2_put
2168 #endif
2169
2170 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2171     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2172     int i;
2173
2174     for(i=0; i<h; i++){
2175         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2176         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2177         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2178         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2179         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2180         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2181         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2182         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2183         dst+=dstStride;
2184         src+=srcStride;
2185     }
2186 }
2187
2188 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2189     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2190     int i;
2191
2192     for(i=0; i<w; i++){
2193         const int src_1= src[ -srcStride];
2194         const int src0 = src[0          ];
2195         const int src1 = src[  srcStride];
2196         const int src2 = src[2*srcStride];
2197         const int src3 = src[3*srcStride];
2198         const int src4 = src[4*srcStride];
2199         const int src5 = src[5*srcStride];
2200         const int src6 = src[6*srcStride];
2201         const int src7 = src[7*srcStride];
2202         const int src8 = src[8*srcStride];
2203         const int src9 = src[9*srcStride];
2204         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2205         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2206         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2207         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2208         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2209         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2210         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2211         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2212         src++;
2213         dst++;
2214     }
2215 }
2216
/** mspel position (0,0): hands the 8 rows straight to put_pixels8_c, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2220
/**
 * mspel position (1,0): horizontally filtered block combined with the
 * unfiltered source through put_pixels8_l2 (presumably a rounded average of
 * its two inputs -- confirm against its definition).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2226
/** mspel position (2,0): the horizontal low-pass result is written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2230
/**
 * mspel position (3,0): like mc10, but the horizontally filtered block is
 * combined with the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, filtered, stride, stride, 8, 8);
}
2236
/** mspel position (0,2): the vertical low-pass result is written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2240
/**
 * mspel position (1,2): combines the vertically filtered source with the
 * block filtered in both directions.
 *
 * The horizontal pass covers 11 rows starting one row above src, which are
 * exactly the rows the vertical filter reads for 8 output rows; h_rows+8
 * therefore points at the row aligned with src.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11*8];
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): same as mc12 except that the vertical-only filter is
 * run on the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11*8];
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal low-pass into an 11 row scratch buffer
 * (starting one row above src), then vertical low-pass of that buffer
 * straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11*8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
2264
2265 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2266     int x;
2267     const int strength= ff_h263_loop_filter_strength[qscale];
2268
2269     for(x=0; x<8; x++){
2270         int d1, d2, ad1;
2271         int p0= src[x-2*stride];
2272         int p1= src[x-1*stride];
2273         int p2= src[x+0*stride];
2274         int p3= src[x+1*stride];
2275         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2276
2277         if     (d<-2*strength) d1= 0;
2278         else if(d<-  strength) d1=-2*strength - d;
2279         else if(d<   strength) d1= d;
2280         else if(d< 2*strength) d1= 2*strength - d;
2281         else                   d1= 0;
2282
2283         p1 += d1;
2284         p2 -= d1;
2285         if(p1&256) p1= ~(p1>>31);
2286         if(p2&256) p2= ~(p2>>31);
2287
2288         src[x-1*stride] = p1;
2289         src[x+0*stride] = p2;
2290
2291         ad1= ABS(d1)>>1;
2292
2293         d2= clip((p0-p3)/4, -ad1, ad1);
2294
2295         src[x-2*stride] = p0 - d2;
2296         src[x+  stride] = p3 + d2;
2297     }
2298 }
2299
2300 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2301     int y;
2302     const int strength= ff_h263_loop_filter_strength[qscale];
2303
2304     for(y=0; y<8; y++){
2305         int d1, d2, ad1;
2306         int p0= src[y*stride-2];
2307         int p1= src[y*stride-1];
2308         int p2= src[y*stride+0];
2309         int p3= src[y*stride+1];
2310         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2311
2312         if     (d<-2*strength) d1= 0;
2313         else if(d<-  strength) d1=-2*strength - d;
2314         else if(d<   strength) d1= d;
2315         else if(d< 2*strength) d1= 2*strength - d;
2316         else                   d1= 0;
2317
2318         p1 += d1;
2319         p2 -= d1;
2320         if(p1&256) p1= ~(p1>>31);
2321         if(p2&256) p2= ~(p2>>31);
2322
2323         src[y*stride-1] = p1;
2324         src[y*stride+0] = p2;
2325
2326         ad1= ABS(d1)>>1;
2327
2328         d2= clip((p0-p3)/4, -ad1, ad1);
2329
2330         src[y*stride-2] = p0 - d2;
2331         src[y*stride+1] = p3 + d2;
2332     }
2333 }
2334
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2362
/**
 * SAD of a 16 pixel wide block against the reference combined horizontally:
 * every reference sample is avg2() of pix2[x] and pix2[x+1], so 17 reference
 * pixels are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2390
/**
 * SAD of a 16 pixel wide block against the reference combined vertically:
 * every reference sample is avg2() of pix2[x] and the pixel one row below,
 * so h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2420
/**
 * SAD of a 16 pixel wide block against the reference combined in both
 * directions: every reference sample is avg4() of the 2x2 neighbourhood
 * starting at pix2[x], so 17 pixels of h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2450
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2470
/**
 * SAD of an 8 pixel wide block against the reference combined horizontally:
 * every reference sample is avg2() of pix2[x] and pix2[x+1], so 9 reference
 * pixels are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2490
/**
 * SAD of an 8 pixel wide block against the reference combined vertically:
 * every reference sample is avg2() of pix2[x] and the pixel one row below,
 * so h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2512
/**
 * SAD of an 8 pixel wide block against the reference combined in both
 * directions: every reference sample is avg4() of the 2x2 neighbourhood
 * starting at pix2[x], so 9 pixels of h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2534
2535 /**
2536  * permutes an 8x8 block.
2537  * @param block the block which will be permuted according to the given permutation vector
2538  * @param permutation the permutation vector
2539  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2540  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2541  *                  (inverse) permutated to scantable order!
2542  */
2543 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2544 {
2545     int i;
2546     DCTELEM temp[64];
2547
2548     if(last<=0) return;
2549     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2550
2551     for(i=0; i<=last; i++){
2552         const int j= scantable[i];
2553         temp[j]= block[j];
2554         block[j]=0;
2555     }
2556
2557     for(i=0; i<=last; i++){
2558         const int j= scantable[i];
2559         const int perm_j= permutation[j];
2560         block[perm_j]= temp[j];
2561     }
2562 }
2563
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
2567
2568 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2569     int i;
2570
2571     memset(cmp, 0, sizeof(void*)*5);
2572
2573     for(i=0; i<5; i++){
2574         switch(type&0xFF){
2575         case FF_CMP_SAD:
2576             cmp[i]= c->sad[i];
2577             break;
2578         case FF_CMP_SATD:
2579             cmp[i]= c->hadamard8_diff[i];
2580             break;
2581         case FF_CMP_SSE:
2582             cmp[i]= c->sse[i];
2583             break;
2584         case FF_CMP_DCT:
2585             cmp[i]= c->dct_sad[i];
2586             break;
2587         case FF_CMP_PSNR:
2588             cmp[i]= c->quant_psnr[i];
2589             break;
2590         case FF_CMP_BIT:
2591             cmp[i]= c->bit[i];
2592             break;
2593         case FF_CMP_RD:
2594             cmp[i]= c->rd[i];
2595             break;
2596         case FF_CMP_VSAD:
2597             cmp[i]= c->vsad[i];
2598             break;
2599         case FF_CMP_VSSE:
2600             cmp[i]= c->vsse[i];
2601             break;
2602         case FF_CMP_ZERO:
2603             cmp[i]= zero_cmp;
2604             break;
2605         default:
2606             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2607         }
2608     }
2609 }
2610
2611 /**
2612  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2613  */
2614 static void clear_blocks_c(DCTELEM *blocks)
2615 {
2616     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2617 }
2618
/**
 * Adds src to dst byte by byte, in ascending order, wrapping modulo 256.
 *
 * @param dst accumulator, w bytes, updated in place
 * @param src addend, w bytes
 * @param w   number of bytes to process; nothing is done for w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] += src[pos];
}
2634
/**
 * Stores the byte-wise difference src1 - src2 in dst, in ascending order,
 * wrapping modulo 256.
 *
 * @param dst  output, w bytes
 * @param src1 minuend, w bytes
 * @param src2 subtrahend, w bytes
 * @param w    number of bytes to process; nothing is done for w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] = src1[pos]-src2[pos];
}
2650
/**
 * Writes the residual of a median predictor to dst.
 *
 * For each position the prediction is mid_pred() of the running left value,
 * src1[i], and (left + src1[i] - left_top) & 0xFF; dst[i] receives
 * src2[i] minus that prediction.  Afterwards src2[i] becomes the new left
 * value and src1[i] the new left_top value.
 *
 * @param dst      residual output, w bytes
 * @param src1     row supplying the second predictor input (presumably the
 *                 line above -- confirm with the caller)
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: initial left value, out: last src2 byte processed
 * @param left_top in: initial left_top value, out: last src1 byte processed
 *                 (both are passed through uint8_t, also when w is 0)
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left    = *left;
    uint8_t cur_left_top= *left_top;
    int pos;

    for(pos=0; pos<w; pos++){
        const uint8_t other= src1[pos];
        const int pred= mid_pred(cur_left, other, (cur_left + other - cur_left_top)&0xFF);

        cur_left_top= other;
        cur_left    = src2[pos];
        dst[pos]    = cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_left_top;
}
2668
/*
 * Butterfly helpers for the Hadamard transforms below.  The statement macros
 * are wrapped in do { } while (0) so that "BUTTERFLYn(...);" is one statement
 * even in an unbraced if/else.
 */

/* o1 = i1 + i2, o2 = i1 - i2; i1 and i2 are evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/* in-place butterfly: x becomes x + y, y becomes x - y (of the old values);
   the temporaries carry a prefix so arguments named a or b are not captured */
#define BUTTERFLY1(x,y) \
do {\
    const int bfly_a= (x);\
    const int bfly_b= (y);\
    (x)= bfly_a+bfly_b;\
    (y)= bfly_a-bfly_b;\
} while (0)

/* |x + y| + |x - y|, i.e. the last butterfly stage folded into the abs sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2683
/**
 * Sum of absolute 8x8 Hadamard-transformed differences between src and dst.
 *
 * The pixel differences src - dst are run through three butterfly stages
 * along each row, then three stages along each column; the last column
 * stage is folded into the absolute-value summation (BUTTERFLYA).
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block
 * @param stride distance between rows, shared by both blocks
 * @param h      must be 8
 * @return the sum of the absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int row, col;

    assert(h==8);

    /* horizontal pass, one row of differences at a time */
    for(row=0; row<8; row++){
        int * const t= temp + 8*row;
        const uint8_t * const ps= src + stride*row;
        const uint8_t * const pd= dst + stride*row;

        BUTTERFLY2(t[0], t[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(t[2], t[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(t[4], t[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(t[6], t[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass, one column at a time */
    for(col=0; col<8; col++){
        int * const t= temp + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
              +BUTTERFLYA(t[8*1], t[8*5])
              +BUTTERFLYA(t[8*2], t[8*6])
              +BUTTERFLYA(t[8*3], t[8*7]);
    }

    return sum;
}
2735
/**
 * Sum of absolute 8x8 Hadamard coefficients of the pixels of src themselves,
 * with the contribution |temp[0] + temp[32]| subtracted at the end (marked
 * "-mean" in the original).
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance between rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int row, col;

    assert(h==8);

    /* horizontal pass, one row of pixels at a time */
    for(row=0; row<8; row++){
        int * const t= temp + 8*row;
        const uint8_t * const px= src + stride*row;

        BUTTERFLY2(t[0], t[1], px[0], px[1]);
        BUTTERFLY2(t[2], t[3], px[2], px[3]);
        BUTTERFLY2(t[4], t[5], px[4], px[5]);
        BUTTERFLY2(t[6], t[7], px[6], px[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass, one column at a time */
    for(col=0; col<8; col++){
        int * const t= temp + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
              +BUTTERFLYA(t[8*1], t[8*5])
              +BUTTERFLYA(t[8*2], t[8*6])
              +BUTTERFLYA(t[8*3], t[8*7]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2783
2784 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2785     MpegEncContext * const s= (MpegEncContext *)c;
2786     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2787     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2788     int sum=0, i;
2789
2790     assert(h==8);
2791
2792     s->dsp.diff_pixels(temp, src1, src2, stride);
2793     s->dsp.fdct(temp);
2794
2795     for(i=0; i<64; i++)
2796         sum+= ABS(temp[i]);
2797
2798     return sum;
2799 }
2800
2801 void simple_idct(DCTELEM *block); //FIXME
2802
2803 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2804     MpegEncContext * const s= (MpegEncContext *)c;
2805     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2806     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2807     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2808     int sum=0, i;
2809
2810     assert(h==8);
2811     s->mb_intra=0;
2812
2813     s->dsp.diff_pixels(temp, src1, src2, stride);
2814
2815     memcpy(bak, temp, 64*sizeof(DCTELEM));
2816
2817     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2818     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2819     simple_idct(temp); //FIXME
2820
2821     for(i=0; i<64; i++)
2822         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2823
2824     return sum;
2825 }
2826
2827 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2828     MpegEncContext * const s= (MpegEncContext *)c;
2829     const uint8_t *scantable= s->intra_scantable.permutated;
2830     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2831     uint64_t __align8 aligned_bak[stride];
2832     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2833     uint8_t * const bak= (uint8_t*)aligned_bak;
2834     int i, last, run, bits, level, distoration, start_i;
2835     const int esc_length= s->ac_esc_length;
2836     uint8_t * length;
2837     uint8_t * last_length;
2838
2839     assert(h==8);
2840
2841     for(i=0; i<8; i++){
2842         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2843         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2844     }
2845
2846     s->dsp.diff_pixels(temp, src1, src2, stride);
2847
2848     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2849
2850     bits=0;
2851
2852     if (s->mb_intra) {
2853         start_i = 1;
2854         length     = s->intra_ac_vlc_length;
2855         last_length= s->intra_ac_vlc_last_length;
2856         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2857     } else {
2858         start_i = 0;
2859         length     = s->inter_ac_vlc_length;
2860         last_length= s->inter_ac_vlc_last_length;
2861     }
2862
2863     if(last>=start_i){
2864         run=0;
2865         for(i=start_i; i<last; i++){
2866             int j= scantable[i];
2867             level= temp[j];
2868
2869             if(level){
2870                 level+=64;
2871                 if((level&(~127)) == 0){
2872                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2873                 }else
2874                     bits+= esc_length;
2875                 run=0;
2876             }else
2877                 run++;
2878         }
2879         i= scantable[last];
2880
2881         level= temp[i] + 64;
2882
2883         assert(level - 64);
2884
2885         if((level&(~127)) == 0){
2886             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2887         }else
2888             bits+= esc_length;
2889
2890     }
2891
2892     if(last>=0){
2893         if(s->mb_intra)
2894             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2895         else
2896             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2897     }
2898
2899     s->dsp.idct_add(bak, stride, temp);
2900
2901     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
2902
2903     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2904 }
2905
2906 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2907     MpegEncContext * const s= (MpegEncContext *)c;
2908     const uint8_t *scantable= s->intra_scantable.permutated;
2909     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2910     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2911     int i, last, run, bits, level, start_i;
2912     const int esc_length= s->ac_esc_length;
2913     uint8_t * length;
2914     uint8_t * last_length;
2915
2916     assert(h==8);
2917
2918     s->dsp.diff_pixels(temp, src1, src2, stride);
2919
2920     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2921
2922     bits=0;
2923
2924     if (s->mb_intra) {
2925         start_i = 1;
2926         length     = s->intra_ac_vlc_length;
2927         last_length= s->intra_ac_vlc_last_length;
2928         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2929     } else {
2930         start_i = 0;
2931         length     = s->inter_ac_vlc_length;
2932         last_length= s->inter_ac_vlc_last_length;
2933     }
2934
2935     if(last>=start_i){
2936         run=0;
2937         for(i=start_i; i<last; i++){
2938             int j= scantable[i];
2939             level= temp[j];
2940
2941             if(level){
2942                 level+=64;
2943                 if((level&(~127)) == 0){
2944                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2945                 }else
2946                     bits+= esc_length;
2947                 run=0;
2948             }else
2949                 run++;
2950         }
2951         i= scantable[last];
2952
2953         level= temp[i] + 64;
2954
2955         assert(level - 64);
2956
2957         if((level&(~127)) == 0){
2958             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2959         }else
2960             bits+= esc_length;
2961     }
2962
2963     return bits;
2964 }
2965
/**
 * Sum of absolute differences between vertically adjacent pixels of a block
 * 16 pixels wide: rows y and y+1 are compared for y = 0 .. h-2.
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance between rows
 * @param h      number of rows
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        const uint8_t * const below= s + stride;

        for(x=0; x<16; x++){
            const int d= s[x] - below[x];

            score+= ABS(d);
        }
        s+= stride;
    }

    return score;
}
2980
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 for blocks
 * 16 pixels wide: (s1 - s2) of row y minus (s1 - s2) of row y+1, for
 * y = 0 .. h-2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between rows, shared by both blocks
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d= s1[x] - s2[x] - s1[x+stride] + s2[x+stride];

            score+= ABS(d);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
2995
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE of a 16-pixel-wide block against itself (intra variant).
 *
 * For every pair of vertically adjacent rows, the squared differences of
 * the 16 pixels in the same column are added up.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between the starts of two rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared differences (0 when h <= 1)
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below = s + stride;

        for (col = 0; col < 16; col++) {
            int delta = s[col] - below[col];

            total += delta * delta;
        }
        s = below;
    }

    return total;
}
3011
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks.
 *
 * For every pair of vertically adjacent rows and every column, takes
 * (s1 - s2) on the upper row minus (s1 - s2) on the lower row and adds up
 * the squares.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between the starts of two rows (both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squares (0 when h <= 1)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += delta * delta;
        }
    }

    return total;
}
3026
/*
 * 16-pixel variants of the 8x8 comparison functions.
 * The *16_c names given as second argument are the ones dsputil_init()
 * stores in the DSPContext compare tables (hadamard8_diff, dct_sad,
 * quant_psnr, rd, bit).
 * NOTE(review): WARPER8_16_SQ is defined outside this section; presumably it
 * defines the second name as a wrapper that applies the first (8x8) function
 * to the sub-blocks of a 16-wide block and sums the results -- confirm
 * against the macro definition.
 */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3033
/* XXX: these wrapper functions should be removed as soon as all IDCTs are
 converted */
/**
 * idct_put entry used by dsputil_init() for FF_IDCT_INT: runs j_rev_dct()
 * on the block, then hands the block to put_pixels_clamped_c().
 *
 * @param dest      destination pixels, forwarded to put_pixels_clamped_c()
 * @param line_size forwarded to put_pixels_clamped_c()
 *                  (presumably the destination row stride -- confirm)
 * @param block     coefficient block; passed to j_rev_dct() first
 *                  (presumably transformed in place -- confirm)
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * idct_add entry used by dsputil_init() for FF_IDCT_INT: runs j_rev_dct()
 * on the block, then hands the block to add_pixels_clamped_c().
 *
 * @param dest      destination pixels, forwarded to add_pixels_clamped_c()
 * @param line_size forwarded to add_pixels_clamped_c()
 *                  (presumably the destination row stride -- confirm)
 * @param block     coefficient block; passed to j_rev_dct() first
 *                  (presumably transformed in place -- confirm)
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3046
3047 /* init static data */
3048 void dsputil_static_init(void)
3049 {
3050     int i;
3051
3052     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3053     for(i=0;i<MAX_NEG_CROP;i++) {
3054         cropTbl[i] = 0;
3055         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3056     }
3057
3058     for(i=0;i<512;i++) {
3059         squareTbl[i] = (i - 256) * (i - 256);
3060     }
3061
3062     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3063 }
3064
3065
3066 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3067 {
3068     int i;
3069
3070 #ifdef CONFIG_ENCODERS
3071     if(avctx->dct_algo==FF_DCT_FASTINT) {
3072         c->fdct = fdct_ifast;
3073         c->fdct248 = fdct_ifast248;
3074     }
3075     else if(avctx->dct_algo==FF_DCT_FAAN) {
3076         c->fdct = ff_faandct;
3077         c->fdct248 = ff_faandct248;
3078     }
3079     else {
3080         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3081         c->fdct248 = ff_fdct248_islow;
3082     }
3083 #endif //CONFIG_ENCODERS
3084
3085     if(avctx->idct_algo==FF_IDCT_INT){
3086         c->idct_put= ff_jref_idct_put;
3087         c->idct_add= ff_jref_idct_add;
3088         c->idct    = j_rev_dct;
3089         c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3090     }else{ //accurate/default
3091         c->idct_put= simple_idct_put;
3092         c->idct_add= simple_idct_add;
3093         c->idct    = simple_idct;
3094         c->idct_permutation_type= FF_NO_IDCT_PERM;
3095     }
3096
3097     c->get_pixels = get_pixels_c;
3098     c->diff_pixels = diff_pixels_c;
3099     c->put_pixels_clamped = put_pixels_clamped_c;
3100     c->add_pixels_clamped = add_pixels_clamped_c;
3101     c->gmc1 = gmc1_c;
3102     c->gmc = gmc_c;
3103     c->clear_blocks = clear_blocks_c;
3104     c->pix_sum = pix_sum_c;
3105     c->pix_norm1 = pix_norm1_c;
3106
3107     /* TODO [0] 16  [1] 8 */
3108     c->pix_abs[0][0] = pix_abs16_c;
3109     c->pix_abs[0][1] = pix_abs16_x2_c;
3110     c->pix_abs[0][2] = pix_abs16_y2_c;
3111     c->pix_abs[0][3] = pix_abs16_xy2_c;
3112     c->pix_abs[1][0] = pix_abs8_c;
3113     c->pix_abs[1][1] = pix_abs8_x2_c;
3114     c->pix_abs[1][2] = pix_abs8_y2_c;
3115     c->pix_abs[1][3] = pix_abs8_xy2_c;
3116
3117 #define dspfunc(PFX, IDX, NUM) \
3118     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3119     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3120     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3121     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3122
3123     dspfunc(put, 0, 16);
3124     dspfunc(put_no_rnd, 0, 16);
3125     dspfunc(put, 1, 8);
3126     dspfunc(put_no_rnd, 1, 8);
3127     dspfunc(put, 2, 4);
3128     dspfunc(put, 3, 2);
3129
3130     dspfunc(avg, 0, 16);
3131     dspfunc(avg_no_rnd, 0, 16);
3132     dspfunc(avg, 1, 8);
3133     dspfunc(avg_no_rnd, 1, 8);
3134     dspfunc(avg, 2, 4);
3135     dspfunc(avg, 3, 2);
3136 #undef dspfunc
3137
3138     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3139     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3140     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3141     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3142     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3143     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3144     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3145     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3146     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3147
3148     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3149     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3150     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3151     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3152     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3153     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3154     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3155     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3156     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3157
3158 #define dspfunc(PFX, IDX, NUM) \
3159     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3160     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3161     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3162     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3163     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3164     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3165     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3166     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3167     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3168     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3169     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3170     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3171     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3172     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3173     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3174     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3175
3176     dspfunc(put_qpel, 0, 16);
3177     dspfunc(put_no_rnd_qpel, 0, 16);
3178
3179     dspfunc(avg_qpel, 0, 16);
3180     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3181
3182     dspfunc(put_qpel, 1, 8);
3183     dspfunc(put_no_rnd_qpel, 1, 8);
3184
3185     dspfunc(avg_qpel, 1, 8);
3186     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3187
3188     dspfunc(put_h264_qpel, 0, 16);
3189     dspfunc(put_h264_qpel, 1, 8);
3190     dspfunc(put_h264_qpel, 2, 4);
3191     dspfunc(avg_h264_qpel, 0, 16);
3192     dspfunc(avg_h264_qpel, 1, 8);
3193     dspfunc(avg_h264_qpel, 2, 4);
3194
3195 #undef dspfunc
3196     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3197     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3198     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3199     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3200     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3201     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3202
3203     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3204     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3205     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3206     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3207     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3208     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3209     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3210     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3211
3212 #define SET_CMP_FUNC(name) \
3213     c->name[0]= name ## 16_c;\
3214     c->name[1]= name ## 8x8_c;
3215
3216     SET_CMP_FUNC(hadamard8_diff)
3217     c->hadamard8_diff[4]= hadamard8_intra16_c;
3218     SET_CMP_FUNC(dct_sad)
3219     c->sad[0]= pix_abs16_c;
3220     c->sad[1]= pix_abs8_c;
3221     c->sse[0]= sse16_c;
3222     c->sse[1]= sse8_c;
3223     SET_CMP_FUNC(quant_psnr)
3224     SET_CMP_FUNC(rd)
3225     SET_CMP_FUNC(bit)
3226     c->vsad[0]= vsad16_c;
3227     c->vsad[4]= vsad_intra16_c;
3228     c->vsse[0]= vsse16_c;
3229     c->vsse[4]= vsse_intra16_c;
3230
3231     c->add_bytes= add_bytes_c;
3232     c->diff_bytes= diff_bytes_c;
3233     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3234     c->bswap_buf= bswap_buf;
3235
3236     c->h263_h_loop_filter= h263_h_loop_filter_c;
3237     c->h263_v_loop_filter= h263_v_loop_filter_c;
3238
3239 #ifdef HAVE_MMX
3240     dsputil_init_mmx(c, avctx);
3241 #endif
3242 #ifdef ARCH_ARMV4L
3243     dsputil_init_armv4l(c, avctx);
3244 #endif
3245 #ifdef HAVE_MLIB
3246     dsputil_init_mlib(c, avctx);
3247 #endif
3248 #ifdef ARCH_ALPHA
3249     dsputil_init_alpha(c, avctx);
3250 #endif
3251 #ifdef ARCH_POWERPC
3252     dsputil_init_ppc(c, avctx);
3253 #endif
3254 #ifdef HAVE_MMI
3255     dsputil_init_mmi(c, avctx);
3256 #endif
3257 #ifdef ARCH_SH4
3258     dsputil_init_sh4(c,avctx);
3259 #endif
3260
3261     switch(c->idct_permutation_type){
3262     case FF_NO_IDCT_PERM:
3263         for(i=0; i<64; i++)
3264             c->idct_permutation[i]= i;
3265         break;
3266     case FF_LIBMPEG2_IDCT_PERM:
3267         for(i=0; i<64; i++)
3268             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3269         break;
3270     case FF_SIMPLE_IDCT_PERM:
3271         for(i=0; i<64; i++)
3272             c->idct_permutation[i]= simple_mmx_permutation[i];
3273         break;
3274     case FF_TRANSPOSE_IDCT_PERM:
3275         for(i=0; i<64; i++)
3276             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3277         break;
3278     default:
3279         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3280     }
3281 }
3282