git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21
  22 /**
  23  * @file dsputil.c
  24  * DSP utils
  25  */
  26
  27 #include "avcodec.h"
  28 #include "dsputil.h"
  29 #include "mpegvideo.h"
  30 #include "simple_idct.h"
  31 #include "faandct.h"
  32
/* Byte lookup table. Code in this file accesses it through a pointer offset by
 * MAX_NEG_CROP (cm = cropTbl + MAX_NEG_CROP), so indices from -MAX_NEG_CROP up
 * to 255 + MAX_NEG_CROP stay inside the array. The contents are not set here;
 * presumably a clamp-to-0..255 table filled in at init time -- TODO confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Lookup table accessed in this file through sq = squareTbl + 256 and indexed
 * with pixel values and pixel differences (-255..255). The contents are not set
 * here; presumably sq[i] == i*i after initialisation -- TODO confirm. */
uint32_t squareTbl[512];
  35
  36 const uint8_t ff_zigzag_direct[64] = {
  37     0,   1,  8, 16,  9,  2,  3, 10,
  38     17, 24, 32, 25, 18, 11,  4,  5,
  39     12, 19, 26, 33, 40, 48, 41, 34,
  40     27, 20, 13,  6,  7, 14, 21, 28,
  41     35, 42, 49, 56, 57, 50, 43, 36,
  42     29, 22, 15, 23, 30, 37, 44, 51,
  43     58, 59, 52, 45, 38, 31, 39, 46,
  44     53, 60, 61, 54, 47, 55, 62, 63
  45 };
  46
  47 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  48    specification, we interleave the fields */
  49 const uint8_t ff_zigzag248_direct[64] = {
  50      0,  8,  1,  9, 16, 24,  2, 10,
  51     17, 25, 32, 40, 48, 56, 33, 41,
  52     18, 26,  3, 11,  4, 12, 19, 27,
  53     34, 42, 49, 57, 50, 58, 35, 43,
  54     20, 28,  5, 13,  6, 14, 21, 29,
  55     36, 44, 51, 59, 52, 60, 37, 45,
  56     22, 30,  7, 15, 23, 31, 38, 46,
  57     53, 61, 54, 62, 39, 47, 55, 63,
  58 };
  59
/* Inverse of ff_zigzag_direct, not permuted, with 1 added to each entry; used
 * by the MMX quantizer. The table is only declared here; it is presumably
 * filled in during initialisation elsewhere -- TODO confirm. */
uint16_t __align8 inv_zigzag_direct16[64];
  62
  63 const uint8_t ff_alternate_horizontal_scan[64] = {
  64     0,  1,   2,  3,  8,  9, 16, 17,
  65     10, 11,  4,  5,  6,  7, 15, 14,
  66     13, 12, 19, 18, 24, 25, 32, 33,
  67     26, 27, 20, 21, 22, 23, 28, 29,
  68     30, 31, 34, 35, 40, 41, 48, 49,
  69     42, 43, 36, 37, 38, 39, 44, 45,
  70     46, 47, 50, 51, 56, 57, 58, 59,
  71     52, 53, 54, 55, 60, 61, 62, 63,
  72 };
  73
  74 const uint8_t ff_alternate_vertical_scan[64] = {
  75     0,  8,  16, 24,  1,  9,  2, 10,
  76     17, 25, 32, 40, 48, 56, 57, 49,
  77     41, 33, 26, 18,  3, 11,  4, 12,
  78     19, 27, 34, 42, 50, 58, 35, 43,
  79     51, 59, 20, 28,  5, 13,  6, 14,
  80     21, 29, 36, 44, 52, 60, 37, 45,
  81     53, 61, 22, 30,  7, 15, 23, 31,
  82     38, 46, 54, 62, 39, 47, 55, 63,
  83 };
  84
  85 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  86 const uint32_t inverse[256]={
  87          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  88  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  89  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  90  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  91  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  92  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  93   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  94   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  95   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  96   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  97   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  98   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  99   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 100   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 101   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 102   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 103   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 104   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 105   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 106   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 107   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 108   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 109   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 110   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 111   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 112   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 113   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 114   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 115   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 116   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 117   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 118   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 119 };
 120
 121 /* Input permutation for the simple_idct_mmx */
 122 static const uint8_t simple_mmx_permutation[64]={
 123         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 124         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 125         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 126         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 127         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 128         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 129         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 130         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 131 };
 132
 133 static int pix_sum_c(uint8_t * pix, int line_size)
 134 {
 135     int s, i, j;
 136
 137     s = 0;
 138     for (i = 0; i < 16; i++) {
 139         for (j = 0; j < 16; j += 8) {
 140             s += pix[0];
 141             s += pix[1];
 142             s += pix[2];
 143             s += pix[3];
 144             s += pix[4];
 145             s += pix[5];
 146             s += pix[6];
 147             s += pix[7];
 148             pix += 8;
 149         }
 150         pix += line_size - 16;
 151     }
 152     return s;
 153 }
 154
 155 static int pix_norm1_c(uint8_t * pix, int line_size)
 156 {
 157     int s, i, j;
 158     uint32_t *sq = squareTbl + 256;
 159
 160     s = 0;
 161     for (i = 0; i < 16; i++) {
 162         for (j = 0; j < 16; j += 8) {
 163 #if 0
 164             s += sq[pix[0]];
 165             s += sq[pix[1]];
 166             s += sq[pix[2]];
 167             s += sq[pix[3]];
 168             s += sq[pix[4]];
 169             s += sq[pix[5]];
 170             s += sq[pix[6]];
 171             s += sq[pix[7]];
 172 #else
 173 #if LONG_MAX > 2147483647
 174             register uint64_t x=*(uint64_t*)pix;
 175             s += sq[x&0xff];
 176             s += sq[(x>>8)&0xff];
 177             s += sq[(x>>16)&0xff];
 178             s += sq[(x>>24)&0xff];
 179             s += sq[(x>>32)&0xff];
 180             s += sq[(x>>40)&0xff];
 181             s += sq[(x>>48)&0xff];
 182             s += sq[(x>>56)&0xff];
 183 #else
 184             register uint32_t x=*(uint32_t*)pix;
 185             s += sq[x&0xff];
 186             s += sq[(x>>8)&0xff];
 187             s += sq[(x>>16)&0xff];
 188             s += sq[(x>>24)&0xff];
 189             x=*(uint32_t*)(pix+4);
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194 #endif
 195 #endif
 196             pix += 8;
 197         }
 198         pix += line_size - 16;
 199     }
 200     return s;
 201 }
 202
 203 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 204     int i;
 205
 206     for(i=0; i+8<=w; i+=8){
 207         dst[i+0]= bswap_32(src[i+0]);
 208         dst[i+1]= bswap_32(src[i+1]);
 209         dst[i+2]= bswap_32(src[i+2]);
 210         dst[i+3]= bswap_32(src[i+3]);
 211         dst[i+4]= bswap_32(src[i+4]);
 212         dst[i+5]= bswap_32(src[i+5]);
 213         dst[i+6]= bswap_32(src[i+6]);
 214         dst[i+7]= bswap_32(src[i+7]);
 215     }
 216     for(;i<w; i++){
 217         dst[i+0]= bswap_32(src[i+0]);
 218     }
 219 }
 220
 221 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
 222 {
 223     int s, i;
 224     uint32_t *sq = squareTbl + 256;
 225
 226     s = 0;
 227     for (i = 0; i < 8; i++) {
 228         s += sq[pix1[0] - pix2[0]];
 229         s += sq[pix1[1] - pix2[1]];
 230         s += sq[pix1[2] - pix2[2]];
 231         s += sq[pix1[3] - pix2[3]];
 232         s += sq[pix1[4] - pix2[4]];
 233         s += sq[pix1[5] - pix2[5]];
 234         s += sq[pix1[6] - pix2[6]];
 235         s += sq[pix1[7] - pix2[7]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < 16; i++) {
 249         s += sq[pix1[ 0] - pix2[ 0]];
 250         s += sq[pix1[ 1] - pix2[ 1]];
 251         s += sq[pix1[ 2] - pix2[ 2]];
 252         s += sq[pix1[ 3] - pix2[ 3]];
 253         s += sq[pix1[ 4] - pix2[ 4]];
 254         s += sq[pix1[ 5] - pix2[ 5]];
 255         s += sq[pix1[ 6] - pix2[ 6]];
 256         s += sq[pix1[ 7] - pix2[ 7]];
 257         s += sq[pix1[ 8] - pix2[ 8]];
 258         s += sq[pix1[ 9] - pix2[ 9]];
 259         s += sq[pix1[10] - pix2[10]];
 260         s += sq[pix1[11] - pix2[11]];
 261         s += sq[pix1[12] - pix2[12]];
 262         s += sq[pix1[13] - pix2[13]];
 263         s += sq[pix1[14] - pix2[14]];
 264         s += sq[pix1[15] - pix2[15]];
 265
 266         pix1 += line_size;
 267         pix2 += line_size;
 268     }
 269     return s;
 270 }
 271
 272 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 273 {
 274     int i;
 275
 276     /* read the pixels */
 277     for(i=0;i<8;i++) {
 278         block[0] = pixels[0];
 279         block[1] = pixels[1];
 280         block[2] = pixels[2];
 281         block[3] = pixels[3];
 282         block[4] = pixels[4];
 283         block[5] = pixels[5];
 284         block[6] = pixels[6];
 285         block[7] = pixels[7];
 286         pixels += line_size;
 287         block += 8;
 288     }
 289 }
 290
 291 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 292                           const uint8_t *s2, int stride){
 293     int i;
 294
 295     /* read the pixels */
 296     for(i=0;i<8;i++) {
 297         block[0] = s1[0] - s2[0];
 298         block[1] = s1[1] - s2[1];
 299         block[2] = s1[2] - s2[2];
 300         block[3] = s1[3] - s2[3];
 301         block[4] = s1[4] - s2[4];
 302         block[5] = s1[5] - s2[5];
 303         block[6] = s1[6] - s2[6];
 304         block[7] = s1[7] - s2[7];
 305         s1 += stride;
 306         s2 += stride;
 307         block += 8;
 308     }
 309 }
 310
 311
 312 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 313                                  int line_size)
 314 {
 315     int i;
 316     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 317
 318     /* read the pixels */
 319     for(i=0;i<8;i++) {
 320         pixels[0] = cm[block[0]];
 321         pixels[1] = cm[block[1]];
 322         pixels[2] = cm[block[2]];
 323         pixels[3] = cm[block[3]];
 324         pixels[4] = cm[block[4]];
 325         pixels[5] = cm[block[5]];
 326         pixels[6] = cm[block[6]];
 327         pixels[7] = cm[block[7]];
 328
 329         pixels += line_size;
 330         block += 8;
 331     }
 332 }
 333
 334 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 335                           int line_size)
 336 {
 337     int i;
 338     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 339
 340     /* read the pixels */
 341     for(i=0;i<8;i++) {
 342         pixels[0] = cm[pixels[0] + block[0]];
 343         pixels[1] = cm[pixels[1] + block[1]];
 344         pixels[2] = cm[pixels[2] + block[2]];
 345         pixels[3] = cm[pixels[3] + block[3]];
 346         pixels[4] = cm[pixels[4] + block[4]];
 347         pixels[5] = cm[pixels[5] + block[5]];
 348         pixels[6] = cm[pixels[6] + block[6]];
 349         pixels[7] = cm[pixels[7] + block[7]];
 350         pixels += line_size;
 351         block += 8;
 352     }
 353 }
 354 #if 0
 355
 356 #define PIXOP2(OPNAME, OP) \
 357 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 358 {\
 359     int i;\
 360     for(i=0; i<h; i++){\
 361         OP(*((uint64_t*)block), LD64(pixels));\
 362         pixels+=line_size;\
 363         block +=line_size;\
 364     }\
 365 }\
 366 \
 367 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 368 {\
 369     int i;\
 370     for(i=0; i<h; i++){\
 371         const uint64_t a= LD64(pixels  );\
 372         const uint64_t b= LD64(pixels+1);\
 373         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 374         pixels+=line_size;\
 375         block +=line_size;\
 376     }\
 377 }\
 378 \
 379 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 380 {\
 381     int i;\
 382     for(i=0; i<h; i++){\
 383         const uint64_t a= LD64(pixels  );\
 384         const uint64_t b= LD64(pixels+1);\
 385         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 386         pixels+=line_size;\
 387         block +=line_size;\
 388     }\
 389 }\
 390 \
 391 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 392 {\
 393     int i;\
 394     for(i=0; i<h; i++){\
 395         const uint64_t a= LD64(pixels          );\
 396         const uint64_t b= LD64(pixels+line_size);\
 397         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 398         pixels+=line_size;\
 399         block +=line_size;\
 400     }\
 401 }\
 402 \
 403 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 404 {\
 405     int i;\
 406     for(i=0; i<h; i++){\
 407         const uint64_t a= LD64(pixels          );\
 408         const uint64_t b= LD64(pixels+line_size);\
 409         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 410         pixels+=line_size;\
 411         block +=line_size;\
 412     }\
 413 }\
 414 \
 415 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 416 {\
 417         int i;\
 418         const uint64_t a= LD64(pixels  );\
 419         const uint64_t b= LD64(pixels+1);\
 420         uint64_t l0=  (a&0x0303030303030303ULL)\
 421                     + (b&0x0303030303030303ULL)\
 422                     + 0x0202020202020202ULL;\
 423         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 424                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 425         uint64_t l1,h1;\
 426 \
 427         pixels+=line_size;\
 428         for(i=0; i<h; i+=2){\
 429             uint64_t a= LD64(pixels  );\
 430             uint64_t b= LD64(pixels+1);\
 431             l1=  (a&0x0303030303030303ULL)\
 432                + (b&0x0303030303030303ULL);\
 433             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 434               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 435             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 436             pixels+=line_size;\
 437             block +=line_size;\
 438             a= LD64(pixels  );\
 439             b= LD64(pixels+1);\
 440             l0=  (a&0x0303030303030303ULL)\
 441                + (b&0x0303030303030303ULL)\
 442                + 0x0202020202020202ULL;\
 443             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 444               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 445             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 446             pixels+=line_size;\
 447             block +=line_size;\
 448         }\
 449 }\
 450 \
 451 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 452 {\
 453         int i;\
 454         const uint64_t a= LD64(pixels  );\
 455         const uint64_t b= LD64(pixels+1);\
 456         uint64_t l0=  (a&0x0303030303030303ULL)\
 457                     + (b&0x0303030303030303ULL)\
 458                     + 0x0101010101010101ULL;\
 459         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 460                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 461         uint64_t l1,h1;\
 462 \
 463         pixels+=line_size;\
 464         for(i=0; i<h; i+=2){\
 465             uint64_t a= LD64(pixels  );\
 466             uint64_t b= LD64(pixels+1);\
 467             l1=  (a&0x0303030303030303ULL)\
 468                + (b&0x0303030303030303ULL);\
 469             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 470               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 471             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 472             pixels+=line_size;\
 473             block +=line_size;\
 474             a= LD64(pixels  );\
 475             b= LD64(pixels+1);\
 476             l0=  (a&0x0303030303030303ULL)\
 477                + (b&0x0303030303030303ULL)\
 478                + 0x0101010101010101ULL;\
 479             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 480               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 481             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 482             pixels+=line_size;\
 483             block +=line_size;\
 484         }\
 485 }\
 486 \
 487 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 488 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 489 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 490 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 491 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 492 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 493 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 494
 495 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 496 #else // 64 bit variant
 497
 498 #define PIXOP2(OPNAME, OP) \
 499 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 500     int i;\
 501     for(i=0; i<h; i++){\
 502         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 503         pixels+=line_size;\
 504         block +=line_size;\
 505     }\
 506 }\
 507 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 508     int i;\
 509     for(i=0; i<h; i++){\
 510         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 511         pixels+=line_size;\
 512         block +=line_size;\
 513     }\
 514 }\
 515 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 516     int i;\
 517     for(i=0; i<h; i++){\
 518         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 519         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 520         pixels+=line_size;\
 521         block +=line_size;\
 522     }\
 523 }\
 524 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 525     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 526 }\
 527 \
 528 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 529                                                 int src_stride1, int src_stride2, int h){\
 530     int i;\
 531     for(i=0; i<h; i++){\
 532         uint32_t a,b;\
 533         a= LD32(&src1[i*src_stride1  ]);\
 534         b= LD32(&src2[i*src_stride2  ]);\
 535         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 536         a= LD32(&src1[i*src_stride1+4]);\
 537         b= LD32(&src2[i*src_stride2+4]);\
 538         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 539     }\
 540 }\
 541 \
 542 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 543                                                 int src_stride1, int src_stride2, int h){\
 544     int i;\
 545     for(i=0; i<h; i++){\
 546         uint32_t a,b;\
 547         a= LD32(&src1[i*src_stride1  ]);\
 548         b= LD32(&src2[i*src_stride2  ]);\
 549         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 550         a= LD32(&src1[i*src_stride1+4]);\
 551         b= LD32(&src2[i*src_stride2+4]);\
 552         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 553     }\
 554 }\
 555 \
 556 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 557                                                 int src_stride1, int src_stride2, int h){\
 558     int i;\
 559     for(i=0; i<h; i++){\
 560         uint32_t a,b;\
 561         a= LD32(&src1[i*src_stride1  ]);\
 562         b= LD32(&src2[i*src_stride2  ]);\
 563         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 564     }\
 565 }\
 566 \
 567 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 568                                                 int src_stride1, int src_stride2, int h){\
 569     int i;\
 570     for(i=0; i<h; i++){\
 571         uint32_t a,b;\
 572         a= LD16(&src1[i*src_stride1  ]);\
 573         b= LD16(&src2[i*src_stride2  ]);\
 574         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 575     }\
 576 }\
 577 \
 578 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 579                                                 int src_stride1, int src_stride2, int h){\
 580     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 581     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 582 }\
 583 \
 584 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 585                                                 int src_stride1, int src_stride2, int h){\
 586     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 587     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 588 }\
 589 \
 590 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 591     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 592 }\
 593 \
 594 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 595     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 596 }\
 597 \
 598 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 599     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 600 }\
 601 \
 602 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 603     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 604 }\
 605 \
 606 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 607                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 608     int i;\
 609     for(i=0; i<h; i++){\
 610         uint32_t a, b, c, d, l0, l1, h0, h1;\
 611         a= LD32(&src1[i*src_stride1]);\
 612         b= LD32(&src2[i*src_stride2]);\
 613         c= LD32(&src3[i*src_stride3]);\
 614         d= LD32(&src4[i*src_stride4]);\
 615         l0=  (a&0x03030303UL)\
 616            + (b&0x03030303UL)\
 617            + 0x02020202UL;\
 618         h0= ((a&0xFCFCFCFCUL)>>2)\
 619           + ((b&0xFCFCFCFCUL)>>2);\
 620         l1=  (c&0x03030303UL)\
 621            + (d&0x03030303UL);\
 622         h1= ((c&0xFCFCFCFCUL)>>2)\
 623           + ((d&0xFCFCFCFCUL)>>2);\
 624         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 625         a= LD32(&src1[i*src_stride1+4]);\
 626         b= LD32(&src2[i*src_stride2+4]);\
 627         c= LD32(&src3[i*src_stride3+4]);\
 628         d= LD32(&src4[i*src_stride4+4]);\
 629         l0=  (a&0x03030303UL)\
 630            + (b&0x03030303UL)\
 631            + 0x02020202UL;\
 632         h0= ((a&0xFCFCFCFCUL)>>2)\
 633           + ((b&0xFCFCFCFCUL)>>2);\
 634         l1=  (c&0x03030303UL)\
 635            + (d&0x03030303UL);\
 636         h1= ((c&0xFCFCFCFCUL)>>2)\
 637           + ((d&0xFCFCFCFCUL)>>2);\
 638         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 639     }\
 640 }\
 641 \
 642 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 643     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 644 }\
 645 \
 646 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 647     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 648 }\
 649 \
 650 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 651     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 652 }\
 653 \
 654 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 655     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 656 }\
 657 \
 658 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 659                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 660     int i;\
 661     for(i=0; i<h; i++){\
 662         uint32_t a, b, c, d, l0, l1, h0, h1;\
 663         a= LD32(&src1[i*src_stride1]);\
 664         b= LD32(&src2[i*src_stride2]);\
 665         c= LD32(&src3[i*src_stride3]);\
 666         d= LD32(&src4[i*src_stride4]);\
 667         l0=  (a&0x03030303UL)\
 668            + (b&0x03030303UL)\
 669            + 0x01010101UL;\
 670         h0= ((a&0xFCFCFCFCUL)>>2)\
 671           + ((b&0xFCFCFCFCUL)>>2);\
 672         l1=  (c&0x03030303UL)\
 673            + (d&0x03030303UL);\
 674         h1= ((c&0xFCFCFCFCUL)>>2)\
 675           + ((d&0xFCFCFCFCUL)>>2);\
 676         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 677         a= LD32(&src1[i*src_stride1+4]);\
 678         b= LD32(&src2[i*src_stride2+4]);\
 679         c= LD32(&src3[i*src_stride3+4]);\
 680         d= LD32(&src4[i*src_stride4+4]);\
 681         l0=  (a&0x03030303UL)\
 682            + (b&0x03030303UL)\
 683            + 0x01010101UL;\
 684         h0= ((a&0xFCFCFCFCUL)>>2)\
 685           + ((b&0xFCFCFCFCUL)>>2);\
 686         l1=  (c&0x03030303UL)\
 687            + (d&0x03030303UL);\
 688         h1= ((c&0xFCFCFCFCUL)>>2)\
 689           + ((d&0xFCFCFCFCUL)>>2);\
 690         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 691     }\
 692 }\
 693 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 694                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 695     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 696     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 697 }\
 698 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 699                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 700     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 701     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 702 }\
 703 \
 704 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 705 {\
 706         int i, a0, b0, a1, b1;\
 707         a0= pixels[0];\
 708         b0= pixels[1] + 2;\
 709         a0 += b0;\
 710         b0 += pixels[2];\
 711 \
 712         pixels+=line_size;\
 713         for(i=0; i<h; i+=2){\
 714             a1= pixels[0];\
 715             b1= pixels[1];\
 716             a1 += b1;\
 717             b1 += pixels[2];\
 718 \
 719             block[0]= (a1+a0)>>2; /* FIXME non put */\
 720             block[1]= (b1+b0)>>2;\
 721 \
 722             pixels+=line_size;\
 723             block +=line_size;\
 724 \
 725             a0= pixels[0];\
 726             b0= pixels[1] + 2;\
 727             a0 += b0;\
 728             b0 += pixels[2];\
 729 \
 730             block[0]= (a1+a0)>>2;\
 731             block[1]= (b1+b0)>>2;\
 732             pixels+=line_size;\
 733             block +=line_size;\
 734         }\
 735 }\
 736 \
/* 4-pixel-wide 2x2 average on four bytes packed in one 32-bit word.\
   Each byte is split into its low 2 bits (l*) and its upper 6 bits already\
   shifted right by 2 (h*), so four bytes can be summed per lane without a\
   carry reaching the neighbouring lane.  Per byte the value handed to OP is\
   (p00 + p01 + p10 + p11 + 2) >> 2.  Two rows are emitted per iteration. */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        /* Words at pixels and pixels+1: the same row, shifted by one byte. */\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        /* Low-bit sums of the first row, with the rounding constant 2 added\
           to every byte lane. */\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            /* These declarations shadow the outer a/b for the loop body. */\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            /* l0+l1 is at most 14 per lane; after >>2 the 0x0F mask drops\
               the bits that were shifted in from the lane above. */\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            /* Second output row: recompute l0/h0 from the next source row\
               and pair them with the l1/h1 just computed. */\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
 772 \
/* 8-pixel-wide 2x2 average, done as two passes (j) over 4-byte columns.\
   Within a pass each byte is split into low 2 bits (l*) and upper 6 bits\
   shifted right by 2 (h*) so that four bytes can be summed per lane; the\
   value handed to OP is, per byte, (p00 + p01 + p10 + p11 + 2) >> 2. */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        /* Words at pixels and pixels+1: the same row, one byte apart. */\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        /* Rounding constant 2 is added once per byte lane, in l0. */\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        /* Two output rows per iteration. */\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            /* The 0x0F mask removes bits shifted in from the lane above. */\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* Step 4 bytes right and rewind h+1 source rows / h destination\
           rows; this lands on the top of the next column only if the loop\
           advanced exactly h rows, i.e. h is even. */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
 813 \
/* Same structure as the rounding 8-wide 2x2 average, but the per-lane\
   constant added to the low-bit sum is 1 instead of 2, so the value handed\
   to OP is, per byte, (p00 + p01 + p10 + p11 + 1) >> 2.  Processed as two\
   passes (j) over 4-byte columns, two rows per inner iteration. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        /* Words at pixels and pixels+1: the same row, one byte apart. */\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        /* l* holds the low 2 bits of each byte, h* the upper 6 bits >> 2. */\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            /* The 0x0F mask removes bits shifted in from the lane above. */\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* Move to the next 4-byte column: rewind h+1 source rows and h\
           destination rows (exact only when h is even). */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
 854 \
/* 16-wide entry points derived from the 8-wide ones through CALL_2X_PIXELS\
   (defined elsewhere; presumably it invokes the 8-wide function twice, 8\
   bytes apart -- confirm against its definition).  Note that\
   _no_rnd_pixels16_c is built on the plain _pixels8_c: no separate no_rnd\
   8-wide full-pel function is referenced here. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
 863
/* avg flavour: merge the new value into the destination via rnd_avg32(). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* put flavour: overwrite the destination with the new value. */
#define op_put(a, b) a = b

/* Expand the PIXOP2 function family once per flavour (names prefixed avg /
   put), then drop the helper macros so later sections can redefine them. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
 872
 873 #define avg2(a,b) ((a+b+1)>>1)
 874 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 875
 876
 877 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 878 {
 879     const int A=(16-x16)*(16-y16);
 880     const int B=(   x16)*(16-y16);
 881     const int C=(16-x16)*(   y16);
 882     const int D=(   x16)*(   y16);
 883     int i;
 884
 885     for(i=0; i<h; i++)
 886     {
 887         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 888         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 889         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 890         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 891         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 892         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 893         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 894         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 895         dst+= stride;
 896         src+= stride;
 897     }
 898 }
 899
 900 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 901                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 902 {
 903     int y, vx, vy;
 904     const int s= 1<<shift;
 905
 906     width--;
 907     height--;
 908
 909     for(y=0; y<h; y++){
 910         int x;
 911
 912         vx= ox;
 913         vy= oy;
 914         for(x=0; x<8; x++){ //XXX FIXME optimize
 915             int src_x, src_y, frac_x, frac_y, index;
 916
 917             src_x= vx>>16;
 918             src_y= vy>>16;
 919             frac_x= src_x&(s-1);
 920             frac_y= src_y&(s-1);
 921             src_x>>=shift;
 922             src_y>>=shift;
 923
 924             if((unsigned)src_x < width){
 925                 if((unsigned)src_y < height){
 926                     index= src_x + src_y*stride;
 927                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 928                                            + src[index       +1]*   frac_x )*(s-frac_y)
 929                                         + (  src[index+stride  ]*(s-frac_x)
 930                                            + src[index+stride+1]*   frac_x )*   frac_y
 931                                         + r)>>(shift*2);
 932                 }else{
 933                     index= src_x + clip(src_y, 0, height)*stride;
 934                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 935                                           + src[index       +1]*   frac_x )*s
 936                                         + r)>>(shift*2);
 937                 }
 938             }else{
 939                 if((unsigned)src_y < height){
 940                     index= clip(src_x, 0, width) + src_y*stride;
 941                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 942                                            + src[index+stride  ]*   frac_y )*s
 943                                         + r)>>(shift*2);
 944                 }else{
 945                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
 946                     dst[y*stride + x]=    src[index         ];
 947                 }
 948             }
 949
 950             vx+= dxx;
 951             vy+= dyx;
 952         }
 953         ox += dxy;
 954         oy += dyy;
 955     }
 956 }
 957
 958 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 959     switch(width){
 960     case 2: put_pixels2_c (dst, src, stride, height); break;
 961     case 4: put_pixels4_c (dst, src, stride, height); break;
 962     case 8: put_pixels8_c (dst, src, stride, height); break;
 963     case 16:put_pixels16_c(dst, src, stride, height); break;
 964     }
 965 }
 966
 967 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 968     int i,j;
 969     for (i=0; i < height; i++) {
 970       for (j=0; j < width; j++) {
 971         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 972       }
 973       src += stride;
 974       dst += stride;
 975     }
 976 }
 977
 978 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 979     int i,j;
 980     for (i=0; i < height; i++) {
 981       for (j=0; j < width; j++) {
 982         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 983       }
 984       src += stride;
 985       dst += stride;
 986     }
 987 }
 988
 989 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 990     int i,j;
 991     for (i=0; i < height; i++) {
 992       for (j=0; j < width; j++) {
 993         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 994       }
 995       src += stride;
 996       dst += stride;
 997     }
 998 }
 999
/**
 * Writes a width x height block from a 2x2 neighbourhood weighted
 * 4 (same pixel), 3 (right), 3 (below), 2 (below-right).  The weighted sum
 * plus 6 is scaled by 2731/32768 (2731*12 = 32772, i.e. roughly a division
 * by 12, the total of the weights).
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1010
/**
 * Writes a width x height block from a 2x2 neighbourhood weighted
 * 3 (same pixel), 2 (right), 4 (below), 3 (below-right).  The weighted sum
 * plus 6 is scaled by 2731/32768 (roughly a division by 12).
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1021
/**
 * Writes a width x height block in which each pixel mixes a source pixel
 * and the pixel one row below it with weights 1:2.  The weighted sum plus
 * 1 is scaled by 683/2048 (roughly a division by 3).
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*below[col] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1032
/**
 * Writes a width x height block from a 2x2 neighbourhood weighted
 * 3 (same pixel), 4 (right), 2 (below), 3 (below-right).  The weighted sum
 * plus 6 is scaled by 2731/32768 (roughly a division by 12).
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1043
/**
 * Writes a width x height block from a 2x2 neighbourhood weighted
 * 2 (same pixel), 3 (right), 3 (below), 4 (below-right).  The weighted sum
 * plus 6 is scaled by 2731/32768 (roughly a division by 12).
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1054
/**
 * Unfiltered averaging variant for the tpel family: forwards to the
 * fixed-width avg_pixelsN_c routine whose N equals width.  Any width other
 * than 2, 4, 8 or 16 does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1063
/**
 * Interpolates each pixel from a source pixel and its right-hand neighbour
 * (weights 2:1, sum plus 1 scaled by 683/2048), then replaces dst with the
 * average of the old dst value and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1074
/**
 * Interpolates each pixel from a source pixel and its right-hand neighbour
 * (weights 1:2, sum plus 1 scaled by 683/2048), then replaces dst with the
 * average of the old dst value and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1085
/**
 * Interpolates each pixel from a source pixel and the pixel one row below
 * (weights 2:1, sum plus 1 scaled by 683/2048), then replaces dst with the
 * average of the old dst value and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1096
/**
 * Interpolates each pixel from a 2x2 neighbourhood weighted 4 (same
 * pixel), 3 (right), 3 (below), 2 (below-right) -- sum plus 6 scaled by
 * 2731/32768 -- then replaces dst with the average of the old dst value
 * and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1107
/**
 * Interpolates each pixel from a 2x2 neighbourhood weighted 3 (same
 * pixel), 2 (right), 4 (below), 3 (below-right) -- sum plus 6 scaled by
 * 2731/32768 -- then replaces dst with the average of the old dst value
 * and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1118
/**
 * Interpolates each pixel from a source pixel and the pixel one row below
 * (weights 1:2, sum plus 1 scaled by 683/2048), then replaces dst with the
 * average of the old dst value and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1129
/**
 * Interpolates each pixel from a 2x2 neighbourhood weighted 3 (same
 * pixel), 4 (right), 2 (below), 3 (below-right) -- sum plus 6 scaled by
 * 2731/32768 -- then replaces dst with the average of the old dst value
 * and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1140
/**
 * Interpolates each pixel from a 2x2 neighbourhood weighted 2 (same
 * pixel), 3 (right), 3 (below), 4 (below-right) -- sum plus 6 scaled by
 * 2731/32768 -- then replaces dst with the average of the old dst value
 * and that result, rounding halves up.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled code: TPEL_WIDTH(width) would emit fixed-width wrappers
   put_tpel_pixels<width>_mcXY_c that forward to the generic
   put_tpel_pixels_mcXY_c helpers with `width` filled in.
   NOTE(review): every wrapper body starts with `void` in front of the call,
   which is not a valid call statement, so this would not compile as written
   if the #if 0 were removed.  Only put_ wrappers are listed; no avg_ ones. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1172
/*
 * Generator for the h264_chroma_mc{2,4,8}_c functions.
 *
 * Each function produces h rows of 2, 4 or 8 pixels.  Every output pixel is
 * a weighted sum of a 2x2 source neighbourhood with weights
 * (8-x or x) * (8-y or y), which add up to 64; the raw sum is handed to OP,
 * which is responsible for rounding, scaling and storing.  x and y must be
 * in 0..7 (checked with assert).  dst and src share the same stride.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}

/* OP flavours: both add 32 and shift right by 6 (the weights total 64);
   avg additionally averages with the old destination, rounding halves up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1243
/**
 * Copies h rows of 4 bytes from src to dst, one LD32/ST32 pair per row.
 * dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1254
/**
 * Copies h rows of 8 bytes from src to dst as two LD32/ST32 word moves per
 * row.  dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1266
/**
 * Copies h rows of 16 bytes from src to dst as four LD32/ST32 word moves
 * per row.  dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1280
/**
 * Copies h rows of 17 bytes from src to dst: four LD32/ST32 word moves
 * followed by a single byte copy for column 16.  dst advances by dstStride
 * and src by srcStride after each row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];   /* trailing odd column */
        dst += dstStride;
        src += srcStride;
    }
}
1295
/**
 * Copies h rows of 9 bytes from src to dst: two LD32/ST32 word moves
 * followed by a single byte copy for column 8.  dst advances by dstStride
 * and src by srcStride after each row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];   /* trailing odd column */
        dst += dstStride;
        src += srcStride;
    }
}
1308
1309
1310 #define QPEL_MC(r, OPNAME, RND, OP) \
1311 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313     int i;\
1314     for(i=0; i<h; i++)\
1315     {\
1316         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324         dst+=dstStride;\
1325         src+=srcStride;\
1326     }\
1327 }\
1328 \
1329 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330     const int w=8;\
1331     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332     int i;\
1333     for(i=0; i<w; i++)\
1334     {\
1335         const int src0= src[0*srcStride];\
1336         const int src1= src[1*srcStride];\
1337         const int src2= src[2*srcStride];\
1338         const int src3= src[3*srcStride];\
1339         const int src4= src[4*srcStride];\
1340         const int src5= src[5*srcStride];\
1341         const int src6= src[6*srcStride];\
1342         const int src7= src[7*srcStride];\
1343         const int src8= src[8*srcStride];\
1344         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352         dst++;\
1353         src++;\
1354     }\
1355 }\
1356 \
1357 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359     int i;\
1360     \
1361     for(i=0; i<h; i++)\
1362     {\
1363         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379         dst+=dstStride;\
1380         src+=srcStride;\
1381     }\
1382 }\
1383 \
1384 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386     int i;\
1387     const int w=16;\
1388     for(i=0; i<w; i++)\
1389     {\
1390         const int src0= src[0*srcStride];\
1391         const int src1= src[1*srcStride];\
1392         const int src2= src[2*srcStride];\
1393         const int src3= src[3*srcStride];\
1394         const int src4= src[4*srcStride];\
1395         const int src5= src[5*srcStride];\
1396         const int src6= src[6*srcStride];\
1397         const int src7= src[7*srcStride];\
1398         const int src8= src[8*srcStride];\
1399         const int src9= src[9*srcStride];\
1400         const int src10= src[10*srcStride];\
1401         const int src11= src[11*srcStride];\
1402         const int src12= src[12*srcStride];\
1403         const int src13= src[13*srcStride];\
1404         const int src14= src[14*srcStride];\
1405         const int src15= src[15*srcStride];\
1406         const int src16= src[16*srcStride];\
1407         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423         dst++;\
1424         src++;\
1425     }\
1426 }\
1427 \
1428 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429     OPNAME ## pixels8_c(dst, src, stride, 8);\
1430 }\
1431 \
1432 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433     uint8_t half[64];\
1434     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436 }\
1437 \
1438 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440 }\
1441 \
1442 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443     uint8_t half[64];\
1444     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446 }\
1447 \
1448 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449     uint8_t full[16*9];\
1450     uint8_t half[64];\
1451     copy_block9(full, src, 16, stride, 9);\
1452     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454 }\
1455 \
1456 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457     uint8_t full[16*9];\
1458     copy_block9(full, src, 16, stride, 9);\
1459     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460 }\
1461 \
1462 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463     uint8_t full[16*9];\
1464     uint8_t half[64];\
1465     copy_block9(full, src, 16, stride, 9);\
1466     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468 }\
1469 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470     uint8_t full[16*9];\
1471     uint8_t halfH[72];\
1472     uint8_t halfV[64];\
1473     uint8_t halfHV[64];\
1474     copy_block9(full, src, 16, stride, 9);\
1475     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479 }\
1480 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481     uint8_t full[16*9];\
1482     uint8_t halfH[72];\
1483     uint8_t halfHV[64];\
1484     copy_block9(full, src, 16, stride, 9);\
1485     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489 }\
1490 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491     uint8_t full[16*9];\
1492     uint8_t halfH[72];\
1493     uint8_t halfV[64];\
1494     uint8_t halfHV[64];\
1495     copy_block9(full, src, 16, stride, 9);\
1496     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500 }\
1501 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502     uint8_t full[16*9];\
1503     uint8_t halfH[72];\
1504     uint8_t halfHV[64];\
1505     copy_block9(full, src, 16, stride, 9);\
1506     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510 }\
1511 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512     uint8_t full[16*9];\
1513     uint8_t halfH[72];\
1514     uint8_t halfV[64];\
1515     uint8_t halfHV[64];\
1516     copy_block9(full, src, 16, stride, 9);\
1517     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521 }\
1522 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523     uint8_t full[16*9];\
1524     uint8_t halfH[72];\
1525     uint8_t halfHV[64];\
1526     copy_block9(full, src, 16, stride, 9);\
1527     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531 }\
1532 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533     uint8_t full[16*9];\
1534     uint8_t halfH[72];\
1535     uint8_t halfV[64];\
1536     uint8_t halfHV[64];\
1537     copy_block9(full, src, 16, stride, 9);\
1538     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1539     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542 }\
1543 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544     uint8_t full[16*9];\
1545     uint8_t halfH[72];\
1546     uint8_t halfHV[64];\
1547     copy_block9(full, src, 16, stride, 9);\
1548     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552 }\
1553 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554     uint8_t halfH[72];\
1555     uint8_t halfHV[64];\
1556     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559 }\
1560 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561     uint8_t halfH[72];\
1562     uint8_t halfHV[64];\
1563     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566 }\
1567 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568     uint8_t full[16*9];\
1569     uint8_t halfH[72];\
1570     uint8_t halfV[64];\
1571     uint8_t halfHV[64];\
1572     copy_block9(full, src, 16, stride, 9);\
1573     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577 }\
1578 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579     uint8_t full[16*9];\
1580     uint8_t halfH[72];\
1581     copy_block9(full, src, 16, stride, 9);\
1582     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585 }\
1586 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587     uint8_t full[16*9];\
1588     uint8_t halfH[72];\
1589     uint8_t halfV[64];\
1590     uint8_t halfHV[64];\
1591     copy_block9(full, src, 16, stride, 9);\
1592     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596 }\
1597 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598     uint8_t full[16*9];\
1599     uint8_t halfH[72];\
1600     copy_block9(full, src, 16, stride, 9);\
1601     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604 }\
1605 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606     uint8_t halfH[72];\
1607     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609 }\
1610 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611     OPNAME ## pixels16_c(dst, src, stride, 16);\
1612 }\
1613 \
1614 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615     uint8_t half[256];\
1616     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618 }\
1619 \
1620 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622 }\
1623 \
1624 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625     uint8_t half[256];\
1626     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628 }\
1629 \
1630 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631     uint8_t full[24*17];\
1632     uint8_t half[256];\
1633     copy_block17(full, src, 24, stride, 17);\
1634     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636 }\
1637 \
1638 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639     uint8_t full[24*17];\
1640     copy_block17(full, src, 24, stride, 17);\
1641     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642 }\
1643 \
1644 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645     uint8_t full[24*17];\
1646     uint8_t half[256];\
1647     copy_block17(full, src, 24, stride, 17);\
1648     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650 }\
1651 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652     uint8_t full[24*17];\
1653     uint8_t halfH[272];\
1654     uint8_t halfV[256];\
1655     uint8_t halfHV[256];\
1656     copy_block17(full, src, 24, stride, 17);\
1657     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661 }\
1662 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663     uint8_t full[24*17];\
1664     uint8_t halfH[272];\
1665     uint8_t halfHV[256];\
1666     copy_block17(full, src, 24, stride, 17);\
1667     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671 }\
1672 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673     uint8_t full[24*17];\
1674     uint8_t halfH[272];\
1675     uint8_t halfV[256];\
1676     uint8_t halfHV[256];\
1677     copy_block17(full, src, 24, stride, 17);\
1678     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682 }\
1683 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684     uint8_t full[24*17];\
1685     uint8_t halfH[272];\
1686     uint8_t halfHV[256];\
1687     copy_block17(full, src, 24, stride, 17);\
1688     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692 }\
1693 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694     uint8_t full[24*17];\
1695     uint8_t halfH[272];\
1696     uint8_t halfV[256];\
1697     uint8_t halfHV[256];\
1698     copy_block17(full, src, 24, stride, 17);\
1699     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703 }\
1704 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705     uint8_t full[24*17];\
1706     uint8_t halfH[272];\
1707     uint8_t halfHV[256];\
1708     copy_block17(full, src, 24, stride, 17);\
1709     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713 }\
1714 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715     uint8_t full[24*17];\
1716     uint8_t halfH[272];\
1717     uint8_t halfV[256];\
1718     uint8_t halfHV[256];\
1719     copy_block17(full, src, 24, stride, 17);\
1720     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1721     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724 }\
1725 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726     uint8_t full[24*17];\
1727     uint8_t halfH[272];\
1728     uint8_t halfHV[256];\
1729     copy_block17(full, src, 24, stride, 17);\
1730     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734 }\
1735 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736     uint8_t halfH[272];\
1737     uint8_t halfHV[256];\
1738     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741 }\
1742 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743     uint8_t halfH[272];\
1744     uint8_t halfHV[256];\
1745     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748 }\
1749 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750     uint8_t full[24*17];\
1751     uint8_t halfH[272];\
1752     uint8_t halfV[256];\
1753     uint8_t halfHV[256];\
1754     copy_block17(full, src, 24, stride, 17);\
1755     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759 }\
1760 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761     uint8_t full[24*17];\
1762     uint8_t halfH[272];\
1763     copy_block17(full, src, 24, stride, 17);\
1764     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767 }\
1768 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769     uint8_t full[24*17];\
1770     uint8_t halfH[272];\
1771     uint8_t halfV[256];\
1772     uint8_t halfHV[256];\
1773     copy_block17(full, src, 24, stride, 17);\
1774     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778 }\
1779 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[24*17];\
1781     uint8_t halfH[272];\
1782     copy_block17(full, src, 24, stride, 17);\
1783     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786 }\
1787 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788     uint8_t halfH[272];\
1789     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791 }
1792
/*
 * Store operators passed as the last argument of QPEL_MC below.
 *   a - destination lvalue (op_avg* also read its current value)
 *   b - unscaled filter sum
 * Each operator adds a bias to b, shifts right by 5 (i.e. divides by 32) and
 * looks the result up in cm. cm is not a parameter: it has to be a table in
 * scope wherever the operator is expanded (presumably the cropTbl clamp
 * table, as in the H.264 functions further down - confirm in QPEL_MC).
 *   op_put         bias 16, plain store
 *   op_put_no_rnd  bias 15, plain store
 *   op_avg         bias 16, then (a + value + 1) >> 1 with the old a
 *   op_avg_no_rnd  bias 15, then (a + value) >> 1 with the old a
 * op_avg_no_rnd is not referenced by any instantiation in this group.
 */
1793 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1794 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1795 #define op_put(a, b) a = cm[((b) + 16)>>5]
1796 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1797
/*
 * Instantiate the MPEG-4 qpel functions. The second argument is the prefix
 * pasted onto the generated names (OPNAME); the third is pasted between
 * "put" and the helper name (RND), selecting put_... or put_no_rnd_...
 * helpers for the intermediate passes. The meaning of the first argument is
 * defined at the top of QPEL_MC. The avg_no_rnd variant is disabled.
 */
1798 QPEL_MC(0, put_       , _       , op_put)
1799 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1800 QPEL_MC(0, avg_       , _       , op_avg)
1801 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are local to this group; the names are redefined later. */
1802 #undef op_avg
1803 #undef op_avg_no_rnd
1804 #undef op_put
1805 #undef op_put_no_rnd
1806
1807 #if 1
/**
 * Generates the static functions
 *   OPNAME h264_qpel{4,8,16}_{h,v,hv}_lowpass
 * for one output mode.
 *
 * OPNAME  prefix pasted onto every generated function name.
 * OP      OP(dst, sum): stores the result of a single filter pass.
 * OP2     OP2(dst, sum): stores the result of the two-pass (hv) filter.
 *
 * Every output is the 6-tap sum with weights (1, -5, 20, 20, -5, 1) over six
 * consecutive samples along the filtered axis. The weights add up to 32, so
 * the sum handed to OP carries a gain of 32 and the one handed to OP2 a gain
 * of 32*32; removing that gain is left to OP/OP2.
 *
 * Memory touched:
 *  - h and v variants read 2 samples before and 3 samples after the block
 *    along the filtered axis (src[-2] .. src[size+2]).
 *  - hv variants additionally need tmp to hold size+5 rows of tmpStride
 *    int16_t values; src is read from 2 rows above to 3 rows below the block.
 *
 * cm is declared in every leaf function but never named in the bodies; it is
 * there for the OP/OP2 expansions, which index it.
 *
 * The 16x16 variants are built from four 8x8 calls (left/right halves of the
 * upper and lower 8 rows).
 */
1808 #define H264_LOWPASS(OPNAME, OP, OP2) \
1809 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1810     const int h=4;\
1811     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1812     int i;\
1813     for(i=0; i<h; i++)\
1814     {\
1815         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1816         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1817         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1818         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1819         dst+=dstStride;\
1820         src+=srcStride;\
1821     }\
1822 }\
1823 \
1824 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1825     const int w=4;\
1826     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1827     int i;\
1828     for(i=0; i<w; i++)/* one column per iteration */\
1829     {\
1830         const int srcB= src[-2*srcStride];\
1831         const int srcA= src[-1*srcStride];\
1832         const int src0= src[0 *srcStride];\
1833         const int src1= src[1 *srcStride];\
1834         const int src2= src[2 *srcStride];\
1835         const int src3= src[3 *srcStride];\
1836         const int src4= src[4 *srcStride];\
1837         const int src5= src[5 *srcStride];\
1838         const int src6= src[6 *srcStride];\
1839         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1840         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1841         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1842         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1843         dst++;\
1844         src++;\
1845     }\
1846 }\
1847 \
1848 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1849     const int h=4;\
1850     const int w=4;\
1851     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1852     int i;\
1853     src -= 2*srcStride;/* first pass starts two rows above the block */\
1854     for(i=0; i<h+5; i++)/* horizontal pass over h+5 rows, unscaled sums kept in tmp */\
1855     {\
1856         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1857         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1858         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1859         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1860         tmp+=tmpStride;\
1861         src+=srcStride;\
1862     }\
1863     tmp -= tmpStride*(h+5-2);/* rewind to intermediate row 2, the row aligned with the first output row */\
1864     for(i=0; i<w; i++)/* vertical pass, one column per iteration */\
1865     {\
1866         const int tmpB= tmp[-2*tmpStride];\
1867         const int tmpA= tmp[-1*tmpStride];\
1868         const int tmp0= tmp[0 *tmpStride];\
1869         const int tmp1= tmp[1 *tmpStride];\
1870         const int tmp2= tmp[2 *tmpStride];\
1871         const int tmp3= tmp[3 *tmpStride];\
1872         const int tmp4= tmp[4 *tmpStride];\
1873         const int tmp5= tmp[5 *tmpStride];\
1874         const int tmp6= tmp[6 *tmpStride];\
1875         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1876         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1877         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1878         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1879         dst++;\
1880         tmp++;\
1881     }\
1882 }\
1883 \
1884 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1885     const int h=8;\
1886     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1887     int i;\
1888     for(i=0; i<h; i++)\
1889     {\
1890         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1891         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1892         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1893         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1894         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1895         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1896         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1897         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1898         dst+=dstStride;\
1899         src+=srcStride;\
1900     }\
1901 }\
1902 \
1903 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1904     const int w=8;\
1905     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1906     int i;\
1907     for(i=0; i<w; i++)/* one column per iteration */\
1908     {\
1909         const int srcB= src[-2*srcStride];\
1910         const int srcA= src[-1*srcStride];\
1911         const int src0= src[0 *srcStride];\
1912         const int src1= src[1 *srcStride];\
1913         const int src2= src[2 *srcStride];\
1914         const int src3= src[3 *srcStride];\
1915         const int src4= src[4 *srcStride];\
1916         const int src5= src[5 *srcStride];\
1917         const int src6= src[6 *srcStride];\
1918         const int src7= src[7 *srcStride];\
1919         const int src8= src[8 *srcStride];\
1920         const int src9= src[9 *srcStride];\
1921         const int src10=src[10*srcStride];\
1922         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1923         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1924         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1925         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1926         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1927         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1928         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1929         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1930         dst++;\
1931         src++;\
1932     }\
1933 }\
1934 \
1935 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1936     const int h=8;\
1937     const int w=8;\
1938     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1939     int i;\
1940     src -= 2*srcStride;/* first pass starts two rows above the block */\
1941     for(i=0; i<h+5; i++)/* horizontal pass over h+5 rows, unscaled sums kept in tmp */\
1942     {\
1943         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1944         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1945         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1946         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1947         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1948         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1949         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1950         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1951         tmp+=tmpStride;\
1952         src+=srcStride;\
1953     }\
1954     tmp -= tmpStride*(h+5-2);/* rewind to intermediate row 2, the row aligned with the first output row */\
1955     for(i=0; i<w; i++)/* vertical pass, one column per iteration */\
1956     {\
1957         const int tmpB= tmp[-2*tmpStride];\
1958         const int tmpA= tmp[-1*tmpStride];\
1959         const int tmp0= tmp[0 *tmpStride];\
1960         const int tmp1= tmp[1 *tmpStride];\
1961         const int tmp2= tmp[2 *tmpStride];\
1962         const int tmp3= tmp[3 *tmpStride];\
1963         const int tmp4= tmp[4 *tmpStride];\
1964         const int tmp5= tmp[5 *tmpStride];\
1965         const int tmp6= tmp[6 *tmpStride];\
1966         const int tmp7= tmp[7 *tmpStride];\
1967         const int tmp8= tmp[8 *tmpStride];\
1968         const int tmp9= tmp[9 *tmpStride];\
1969         const int tmp10=tmp[10*tmpStride];\
1970         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1971         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1972         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1973         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1974         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1975         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1976         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1977         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1978         dst++;\
1979         tmp++;\
1980     }\
1981 }\
1982 \
1983 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1984     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1985     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1986     src += 8*srcStride;\
1987     dst += 8*dstStride;\
1988     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1989     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1990 }\
1991 \
1992 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1993     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1994     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1995     src += 8*srcStride;\
1996     dst += 8*dstStride;\
1997     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1998     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1999 }\
2000 \
2001 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2002     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2003     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2004     src += 8*srcStride;\
2005     tmp += 8*tmpStride;/* lower half: its 13 intermediate rows overwrite rows 8..12 left by the upper half */\
2006     dst += 8*dstStride;\
2007     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2008     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2009 }\
2010
/**
 * Generates the sixteen static functions
 *   OPNAME h264_qpel<SIZE>_mcXY_c(uint8_t *dst, uint8_t *src, int stride)
 * with X and Y in 0..3. Judging by the filters each one applies, X and Y
 * appear to be the horizontal and vertical quarter-sample offsets - confirm
 * against the table that registers these functions.
 *
 * OPNAME  prefix of the generated names and of the final store/combine
 *         helper (OPNAME pixels<SIZE>_c, OPNAME pixels<SIZE>_l2 and the
 *         OPNAME ..._lowpass functions).
 * SIZE    block width and height; pasted into helper names, so
 *         copy_block<SIZE>, pixels<SIZE>_l2 and the h264_qpel<SIZE> lowpass
 *         functions must exist for the chosen value.
 *
 * Structure of the generated functions:
 *  - mc00 forwards to OPNAME pixels<SIZE>_c.
 *  - mc20, mc02 and mc22 write the h, v and hv lowpass result straight to
 *    dst through the OPNAME lowpass function.
 *  - every other position builds two SIZE x SIZE inputs (source samples
 *    and/or put_ lowpass results in local buffers) and hands both to
 *    OPNAME pixels<SIZE>_l2 (defined elsewhere; presumably it averages its
 *    two sources - confirm).
 *
 * Buffers:
 *  - full holds SIZE+5 rows copied from two rows above src with a row pitch
 *    of SIZE, so the vertical filter can run with srcStride == SIZE;
 *    full_mid = full + SIZE*2 is the row that corresponds to src. Positions
 *    with X == 3 copy from src + 1 instead (mc31, mc33, mc32).
 *  - tmp is the int16_t scratch area of SIZE*(SIZE+5) entries required by
 *    the hv lowpass.
 *  - positions with Y == 3 that need a horizontal result take it from
 *    src + stride (mc13, mc33, mc23); mc03 pairs the vertical result with
 *    full_mid + SIZE, i.e. the row below src.
 */
2011 #define H264_MC(OPNAME, SIZE) \
2012 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2013     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2014 }\
2015 \
2016 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2017     uint8_t half[SIZE*SIZE];\
2018     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2019     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2020 }\
2021 \
2022 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2023     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2024 }\
2025 \
2026 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2027     uint8_t half[SIZE*SIZE];\
2028     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2029     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2030 }\
2031 \
2032 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2033     uint8_t full[SIZE*(SIZE+5)];\
2034     uint8_t * const full_mid= full + SIZE*2;\
2035     uint8_t half[SIZE*SIZE];\
2036     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2037     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2038     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2039 }\
2040 \
2041 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t full[SIZE*(SIZE+5)];\
2043     uint8_t * const full_mid= full + SIZE*2;\
2044     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2045     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2046 }\
2047 \
2048 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2049     uint8_t full[SIZE*(SIZE+5)];\
2050     uint8_t * const full_mid= full + SIZE*2;\
2051     uint8_t half[SIZE*SIZE];\
2052     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2053     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2054     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2055 }\
2056 \
2057 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058     uint8_t full[SIZE*(SIZE+5)];\
2059     uint8_t * const full_mid= full + SIZE*2;\
2060     uint8_t halfH[SIZE*SIZE];\
2061     uint8_t halfV[SIZE*SIZE];\
2062     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2063     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2064     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2065     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2066 }\
2067 \
2068 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2069     uint8_t full[SIZE*(SIZE+5)];\
2070     uint8_t * const full_mid= full + SIZE*2;\
2071     uint8_t halfH[SIZE*SIZE];\
2072     uint8_t halfV[SIZE*SIZE];\
2073     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2074     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2075     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2076     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2077 }\
2078 \
2079 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2080     uint8_t full[SIZE*(SIZE+5)];\
2081     uint8_t * const full_mid= full + SIZE*2;\
2082     uint8_t halfH[SIZE*SIZE];\
2083     uint8_t halfV[SIZE*SIZE];\
2084     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2085     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2086     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2087     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2088 }\
2089 \
2090 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2091     uint8_t full[SIZE*(SIZE+5)];\
2092     uint8_t * const full_mid= full + SIZE*2;\
2093     uint8_t halfH[SIZE*SIZE];\
2094     uint8_t halfV[SIZE*SIZE];\
2095     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2096     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2097     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2098     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2099 }\
2100 \
2101 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2102     int16_t tmp[SIZE*(SIZE+5)];\
2103     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2104 }\
2105 \
2106 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2107     int16_t tmp[SIZE*(SIZE+5)];\
2108     uint8_t halfH[SIZE*SIZE];\
2109     uint8_t halfHV[SIZE*SIZE];\
2110     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2111     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2112     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2113 }\
2114 \
2115 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2116     int16_t tmp[SIZE*(SIZE+5)];\
2117     uint8_t halfH[SIZE*SIZE];\
2118     uint8_t halfHV[SIZE*SIZE];\
2119     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2120     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2121     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2122 }\
2123 \
2124 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2125     uint8_t full[SIZE*(SIZE+5)];\
2126     uint8_t * const full_mid= full + SIZE*2;\
2127     int16_t tmp[SIZE*(SIZE+5)];\
2128     uint8_t halfV[SIZE*SIZE];\
2129     uint8_t halfHV[SIZE*SIZE];\
2130     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2131     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2132     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2133     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2134 }\
2135 \
2136 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2137     uint8_t full[SIZE*(SIZE+5)];\
2138     uint8_t * const full_mid= full + SIZE*2;\
2139     int16_t tmp[SIZE*(SIZE+5)];\
2140     uint8_t halfV[SIZE*SIZE];\
2141     uint8_t halfHV[SIZE*SIZE];\
2142     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2143     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2144     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2145     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2146 }\
2147
/*
 * Store operators handed to H264_LOWPASS below.
 *   op_*  : add 16, shift right by 5, then look the result up in cm[].
 *   op2_* : add 512, shift right by 10, then look the result up in cm[].
 * The "put" forms assign the looked-up value to a; the "avg" forms replace a
 * by the average of its current value and the looked-up value, rounding up.
 * NOTE(review): cm is not defined here; it has to be in scope wherever these
 * macros expand (presumably the cropTbl based clamp table) -- confirm in
 * H264_LOWPASS.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass helpers once per store flavour, then the motion
   compensation functions for both flavours and for SIZE 4, 8 and 16. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store operators are only meaningful for the expansions above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2167 #endif
2168
2169 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2170     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2171     int i;
2172
2173     for(i=0; i<h; i++){
2174         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2175         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2176         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2177         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2178         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2179         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2180         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2181         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2182         dst+=dstStride;
2183         src+=srcStride;
2184     }
2185 }
2186
2187 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2188     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2189     int i;
2190
2191     for(i=0; i<w; i++){
2192         const int src_1= src[ -srcStride];
2193         const int src0 = src[0          ];
2194         const int src1 = src[  srcStride];
2195         const int src2 = src[2*srcStride];
2196         const int src3 = src[3*srcStride];
2197         const int src4 = src[4*srcStride];
2198         const int src5 = src[5*srcStride];
2199         const int src6 = src[6*srcStride];
2200         const int src7 = src[7*srcStride];
2201         const int src8 = src[8*srcStride];
2202         const int src9 = src[9*srcStride];
2203         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2204         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2205         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2206         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2207         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2208         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2209         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2210         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2211         src++;
2212         dst++;
2213     }
2214 }
2215
/**
 * mspel 8x8 "put" without any filtering: hands dst, src and stride to
 * put_pixels8_c() with a height of 8.
 */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    const int rows= 8;

    put_pixels8_c(dst, src, stride, rows);
}
2219
/**
 * mspel 8x8 "put": filters the block horizontally into a packed 8x8 scratch
 * buffer and passes the unfiltered source together with that buffer to
 * put_pixels8_l2().
 * NOTE(review): put_pixels8_l2() presumably averages its two inputs -- it is
 * defined elsewhere, confirm there.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2225
/**
 * mspel 8x8 "put": the horizontally filtered block is written straight to
 * dst (8 rows, same stride for source and destination).
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows= 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2229
/**
 * mspel 8x8 "put": filters the block horizontally into a packed 8x8 scratch
 * buffer and passes the source shifted right by one pixel together with
 * that buffer to put_pixels8_l2().
 * NOTE(review): put_pixels8_l2() presumably averages its two inputs -- it is
 * defined elsewhere, confirm there.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2235
/**
 * mspel 8x8 "put": the vertically filtered block is written straight to dst
 * (8 columns, same stride for source and destination).
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int columns= 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2239
/**
 * mspel 8x8 "put" built from two intermediate blocks:
 *  - vert:     the source filtered vertically,
 *  - horvert:  the source filtered horizontally, then vertically.
 * The horizontal pass covers 11 rows starting one row above the block
 * because the vertical filter reads rows -1 .. 9; hence the 8*11 buffer
 * and the +8 offset (one packed row) when it is filtered again.
 * Both intermediates are handed to put_pixels8_l2().
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hor[8*11];
    uint8_t vert[8*8];
    uint8_t horvert[8*8];

    wmv2_mspel8_h_lowpass(hor, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(horvert, hor+8, 8, 8, 8);

    put_pixels8_l2(dst, vert, horvert, stride, 8, 8, 8);
}
/**
 * mspel 8x8 "put" built from two intermediate blocks:
 *  - vert:     the source shifted right by one pixel, filtered vertically,
 *  - horvert:  the source filtered horizontally, then vertically.
 * The horizontal pass covers 11 rows starting one row above the block
 * because the vertical filter reads rows -1 .. 9; hence the 8*11 buffer
 * and the +8 offset (one packed row) when it is filtered again.
 * Both intermediates are handed to put_pixels8_l2().
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hor[8*11];
    uint8_t vert[8*8];
    uint8_t horvert[8*8];

    wmv2_mspel8_h_lowpass(hor, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(horvert, hor+8, 8, 8, 8);

    put_pixels8_l2(dst, vert, horvert, stride, 8, 8, 8);
}
/**
 * mspel 8x8 "put": horizontal filter followed by vertical filter.
 * The horizontal pass produces 11 packed rows starting one row above the
 * block (the vertical filter reads rows -1 .. 9); the vertical pass starts
 * at the second packed row and writes directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hor[8*11];

    wmv2_mspel8_h_lowpass(hor, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hor+8, stride, 8, 8);
}
2263
2264 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2265     int x;
2266     const int strength= ff_h263_loop_filter_strength[qscale];
2267
2268     for(x=0; x<8; x++){
2269         int d1, d2, ad1;
2270         int p0= src[x-2*stride];
2271         int p1= src[x-1*stride];
2272         int p2= src[x+0*stride];
2273         int p3= src[x+1*stride];
2274         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2275
2276         if     (d<-2*strength) d1= 0;
2277         else if(d<-  strength) d1=-2*strength - d;
2278         else if(d<   strength) d1= d;
2279         else if(d< 2*strength) d1= 2*strength - d;
2280         else                   d1= 0;
2281
2282         p1 += d1;
2283         p2 -= d1;
2284         if(p1&256) p1= ~(p1>>31);
2285         if(p2&256) p2= ~(p2>>31);
2286
2287         src[x-1*stride] = p1;
2288         src[x+0*stride] = p2;
2289
2290         ad1= ABS(d1);
2291
2292         d2= clip((p0-p3)/4, -ad1, ad1);
2293
2294         src[x-2*stride] = p0 - d2;
2295         src[x+  stride] = p3 + d2;
2296     }
2297 }
2298
2299 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2300     int y;
2301     const int strength= ff_h263_loop_filter_strength[qscale];
2302
2303     for(y=0; y<8; y++){
2304         int d1, d2, ad1;
2305         int p0= src[y*stride-2];
2306         int p1= src[y*stride-1];
2307         int p2= src[y*stride+0];
2308         int p3= src[y*stride+1];
2309         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2310
2311         if     (d<-2*strength) d1= 0;
2312         else if(d<-  strength) d1=-2*strength - d;
2313         else if(d<   strength) d1= d;
2314         else if(d< 2*strength) d1= 2*strength - d;
2315         else                   d1= 0;
2316
2317         p1 += d1;
2318         p2 -= d1;
2319         if(p1&256) p1= ~(p1>>31);
2320         if(p2&256) p2= ~(p2>>31);
2321
2322         src[y*stride-1] = p1;
2323         src[y*stride+0] = p2;
2324
2325         ad1= ABS(d1)>>1;
2326
2327         d2= clip((p0-p3)/4, -ad1, ad1);
2328
2329         src[y*stride-2] = p0 - d2;
2330         src[y*stride+1] = p3 + d2;
2331     }
2332 }
2333
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance between rows, used for both blocks
 * @return sum of |pix1 - pix2| over all 256 positions
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total= 0;
    int y, x;

    for(y=0; y<16; y++){
        for(x=0; x<16; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2361
/**
 * 16x16 sum of absolute differences against a horizontally interpolated
 * reference: every pix1 sample is compared with avg2() of the pix2 sample at
 * the same position and its right neighbour, so 17 columns of pix2 are read.
 * @param pix1      top-left pixel of the block
 * @param pix2      top-left pixel of the reference
 * @param line_size distance between rows, used for both blocks
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total= 0;
    int y, x;

    for(y=0; y<16; y++){
        for(x=0; x<16; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2389
/**
 * 16x16 sum of absolute differences against a vertically interpolated
 * reference: every pix1 sample is compared with avg2() of the pix2 sample at
 * the same position and the one directly below it, so 17 rows of pix2 are
 * read.
 * @param pix1      top-left pixel of the block
 * @param pix2      top-left pixel of the reference
 * @param line_size distance between rows, used for both blocks
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below= pix2 + line_size;
    int total= 0;
    int y, x;

    for(y=0; y<16; y++){
        for(x=0; x<16; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2419
/**
 * 16x16 sum of absolute differences against a reference interpolated in both
 * directions: every pix1 sample is compared with avg4() of the 2x2 group of
 * pix2 samples whose top-left corner is at the same position, so 17 rows and
 * 17 columns of pix2 are read.
 * @param pix1      top-left pixel of the block
 * @param pix2      top-left pixel of the reference
 * @param line_size distance between rows, used for both blocks
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below= pix2 + line_size;
    int total= 0;
    int y, x;

    for(y=0; y<16; y++){
        for(x=0; x<16; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2449
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance between rows, used for both blocks
 * @return sum of |pix1 - pix2| over all 64 positions
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total= 0;
    int y, x;

    for(y=0; y<8; y++){
        for(x=0; x<8; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2469
/**
 * 8x8 sum of absolute differences against a horizontally interpolated
 * reference: every pix1 sample is compared with avg2() of the pix2 sample at
 * the same position and its right neighbour, so 9 columns of pix2 are read.
 * @param pix1      top-left pixel of the block
 * @param pix2      top-left pixel of the reference
 * @param line_size distance between rows, used for both blocks
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total= 0;
    int y, x;

    for(y=0; y<8; y++){
        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2489
/**
 * 8x8 sum of absolute differences against a vertically interpolated
 * reference: every pix1 sample is compared with avg2() of the pix2 sample at
 * the same position and the one directly below it, so 9 rows of pix2 are
 * read.
 * @param pix1      top-left pixel of the block
 * @param pix2      top-left pixel of the reference
 * @param line_size distance between rows, used for both blocks
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below= pix2 + line_size;
    int total= 0;
    int y, x;

    for(y=0; y<8; y++){
        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2511
/**
 * 8x8 sum of absolute differences against a reference interpolated in both
 * directions: every pix1 sample is compared with avg4() of the 2x2 group of
 * pix2 samples whose top-left corner is at the same position, so 9 rows and
 * 9 columns of pix2 are read.
 * @param pix1      top-left pixel of the block
 * @param pix2      top-left pixel of the reference
 * @param line_size distance between rows, used for both blocks
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below= pix2 + line_size;
    int total= 0;
    int y, x;

    for(y=0; y<8; y++){
        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2533
/**
 * 16x16 sum of absolute differences behind the (context, a, b, stride)
 * compare signature; forwards to pix_abs16x16_c().
 * @param s unused
 */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    const int sad= pix_abs16x16_c(a, b, stride);

    return sad;
}
2537
/**
 * 8x8 sum of absolute differences behind the (context, a, b, stride)
 * compare signature; forwards to pix_abs8x8_c().
 * @param s unused
 */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    const int sad= pix_abs8x8_c(a, b, stride);

    return sad;
}
2541
2542 /**
2543  * permutes an 8x8 block.
2544  * @param block the block which will be permuted according to the given permutation vector
2545  * @param permutation the permutation vector
2546  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2547  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2548  *                  (inverse) permutated to scantable order!
2549  */
2550 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2551 {
2552     int i;
2553     DCTELEM temp[64];
2554
2555     if(last<=0) return;
2556     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2557
2558     for(i=0; i<=last; i++){
2559         const int j= scantable[i];
2560         temp[j]= block[j];
2561         block[j]=0;
2562     }
2563
2564     for(i=0; i<=last; i++){
2565         const int j= scantable[i];
2566         const int perm_j= permutation[j];
2567         block[perm_j]= temp[j];
2568     }
2569 }
2570
2571 /**
2572  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2573  */
2574 static void clear_blocks_c(DCTELEM *blocks)
2575 {
2576     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2577 }
2578
/**
 * In-place byte-wise addition: dst[i] += src[i] for 0 <= i < w.
 * Results wrap around modulo 256; nothing happens for w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] += src[n];
}
2594
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Results wrap around modulo 256; nothing happens for w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] = src1[n] - src2[n];
}
2610
/**
 * Writes the prediction residual of src2 to dst.
 * For every position the predictor is mid_pred() of the running left value,
 * src1[i] and (left + src1[i] - left_top) & 0xFF; dst[i] receives
 * src2[i] minus that predictor (modulo 256).
 * NOTE(review): src1 is presumably the line above src2 and mid_pred()
 * presumably the median of its three arguments -- confirm at the callers
 * and at the definition of mid_pred().
 * @param dst      residual output, w bytes
 * @param src1     first input line, w bytes
 * @param src2     second input line (the one being predicted), w bytes
 * @param w        number of bytes to process
 * @param left     in: initial left value, out: last src2 value processed
 * @param left_top in: initial left_top value, out: last src1 value processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left= *left;
    uint8_t cur_left_top= *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top= src1[x];
        const int gradient= (cur_left + top - cur_left_top)&0xFF;
        const int pred= mid_pred(cur_left, top, gradient);

        cur_left_top= top;
        cur_left= src2[x];
        dst[x]= cur_left - pred;
    }

    *left= cur_left;
    *left_top= cur_left_top;
}
2628
/* Butterfly into separate outputs: o1 = i1+i2 and o2 = i1-i2.
   Expands to two statements, so it has to be wrapped in braces when used as
   the body of an if/for/while. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y, both computed from
   the old values. The temporaries are called a and b, so the arguments must
   not be expressions that mention identifiers named a or b. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded into the cost: ABS(x+y) + ABS(x-y), without
   storing the two results. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2643
/**
 * Cost of the 8x8 difference src - dst after three butterfly stages along
 * the rows and three along the columns: the sum of the absolute values of
 * all 64 resulting coefficients.
 * @param s      unused
 * @param dst    top-left pixel of the block that is subtracted
 * @param src    top-left pixel of the block that is subtracted from
 * @param stride distance between rows, used for both blocks
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int temp[64];
    int sum= 0;
    int i, k;

    /* row pass: stage 1 works on the pixel differences, stages 2 and 3 on
       its output */
    for(i=0; i<8; i++){
        int * const row= temp + 8*i;
        const uint8_t * const cur= src + stride*i;
        const uint8_t * const ref= dst + stride*i;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(row[k], row[k+1], cur[k]-ref[k], cur[k+1]-ref[k+1]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(row[k  ], row[k+2]);
            BUTTERFLY1(row[k+1], row[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(row[k], row[k+4]);
        }
    }

    /* column pass: the last stage is never stored, BUTTERFLYA adds the
       absolute values of its outputs directly */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        for(k=0; k<8; k+=2){
            BUTTERFLY1(col[8*k], col[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(col[8*k    ], col[8*(k+2)]);
            BUTTERFLY1(col[8*(k+1)], col[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(col[8*k], col[8*(k+4)]);
        }
    }

    return sum;
}
2693
/**
 * Cost of an 8x8 block with mean subtracted from every pixel, after three
 * butterfly stages along the rows and three along the columns: the sum of
 * the absolute values of all 64 resulting coefficients.
 * FIXME (from the original author): ignore the 0 term instead of the mean
 * mess.
 * @param src    top-left pixel of the block
 * @param stride distance between rows
 * @param mean   value subtracted from every pixel before transforming
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int temp[64];
    int sum= 0;
    int i, k;

    /* row pass: stage 1 works on the mean-removed pixels, stages 2 and 3 on
       its output */
    for(i=0; i<8; i++){
        int * const row= temp + 8*i;
        const uint8_t * const pix= src + stride*i;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(row[k], row[k+1], pix[k]-mean, pix[k+1]-mean);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(row[k  ], row[k+2]);
            BUTTERFLY1(row[k+1], row[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(row[k], row[k+4]);
        }
    }

    /* column pass: the last stage is never stored, BUTTERFLYA adds the
       absolute values of its outputs directly */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        for(k=0; k<8; k+=2){
            BUTTERFLY1(col[8*k], col[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(col[8*k    ], col[8*(k+2)]);
            BUTTERFLY1(col[8*(k+1)], col[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(col[8*k], col[8*(k+4)]);
        }
    }

    return sum;
}
2737
2738 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2739     MpegEncContext * const s= (MpegEncContext *)c;
2740     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2741     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2742     int sum=0, i;
2743
2744     s->dsp.diff_pixels(temp, src1, src2, stride);
2745     s->dsp.fdct(temp);
2746
2747     for(i=0; i<64; i++)
2748         sum+= ABS(temp[i]);
2749
2750     return sum;
2751 }
2752
/* Local prototype for simple_idct(), which quant_psnr8x8_c() calls directly.
   NOTE(review): "simple_idct.h" is already included at the top of this file
   and presumably declares it -- confirm before dropping this line. */
void simple_idct(DCTELEM *block); //FIXME
2754
2755 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2756     MpegEncContext * const s= (MpegEncContext *)c;
2757     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2758     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2759     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2760     int sum=0, i;
2761
2762     s->mb_intra=0;
2763
2764     s->dsp.diff_pixels(temp, src1, src2, stride);
2765
2766     memcpy(bak, temp, 64*sizeof(DCTELEM));
2767
2768     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2769     s->dct_unquantize(s, temp, 0, s->qscale);
2770     simple_idct(temp); //FIXME
2771
2772     for(i=0; i<64; i++)
2773         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2774
2775     return sum;
2776 }
2777
2778 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2779     MpegEncContext * const s= (MpegEncContext *)c;
2780     const uint8_t *scantable= s->intra_scantable.permutated;
2781     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2782     uint64_t __align8 aligned_bak[stride];
2783     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2784     uint8_t * const bak= (uint8_t*)aligned_bak;
2785     int i, last, run, bits, level, distoration, start_i;
2786     const int esc_length= s->ac_esc_length;
2787     uint8_t * length;
2788     uint8_t * last_length;
2789
2790     for(i=0; i<8; i++){
2791         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2792         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2793     }
2794
2795     s->dsp.diff_pixels(temp, src1, src2, stride);
2796
2797     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2798
2799     bits=0;
2800
2801     if (s->mb_intra) {
2802         start_i = 1;
2803         length     = s->intra_ac_vlc_length;
2804         last_length= s->intra_ac_vlc_last_length;
2805         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2806     } else {
2807         start_i = 0;
2808         length     = s->inter_ac_vlc_length;
2809         last_length= s->inter_ac_vlc_last_length;
2810     }
2811
2812     if(last>=start_i){
2813         run=0;
2814         for(i=start_i; i<last; i++){
2815             int j= scantable[i];
2816             level= temp[j];
2817
2818             if(level){
2819                 level+=64;
2820                 if((level&(~127)) == 0){
2821                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2822                 }else
2823                     bits+= esc_length;
2824                 run=0;
2825             }else
2826                 run++;
2827         }
2828         i= scantable[last];
2829
2830         level= temp[i] + 64;
2831
2832         assert(level - 64);
2833
2834         if((level&(~127)) == 0){
2835             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2836         }else
2837             bits+= esc_length;
2838
2839     }
2840
2841     if(last>=0){
2842         s->dct_unquantize(s, temp, 0, s->qscale);
2843     }
2844
2845     s->dsp.idct_add(bak, stride, temp);
2846
2847     distoration= s->dsp.sse[1](NULL, bak, src1, stride);
2848
2849     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2850 }
2851
2852 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2853     MpegEncContext * const s= (MpegEncContext *)c;
2854     const uint8_t *scantable= s->intra_scantable.permutated;
2855     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2856     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2857     int i, last, run, bits, level, start_i;
2858     const int esc_length= s->ac_esc_length;
2859     uint8_t * length;
2860     uint8_t * last_length;
2861
2862     s->dsp.diff_pixels(temp, src1, src2, stride);
2863
2864     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2865
2866     bits=0;
2867
2868     if (s->mb_intra) {
2869         start_i = 1;
2870         length     = s->intra_ac_vlc_length;
2871         last_length= s->intra_ac_vlc_last_length;
2872         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2873     } else {
2874         start_i = 0;
2875         length     = s->inter_ac_vlc_length;
2876         last_length= s->inter_ac_vlc_last_length;
2877     }
2878
2879     if(last>=start_i){
2880         run=0;
2881         for(i=start_i; i<last; i++){
2882             int j= scantable[i];
2883             level= temp[j];
2884
2885             if(level){
2886                 level+=64;
2887                 if((level&(~127)) == 0){
2888                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2889                 }else
2890                     bits+= esc_length;
2891                 run=0;
2892             }else
2893                 run++;
2894         }
2895         i= scantable[last];
2896
2897         level= temp[i] + 64;
2898
2899         assert(level - 64);
2900
2901         if((level&(~127)) == 0){
2902             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2903         }else
2904             bits+= esc_length;
2905     }
2906
2907     return bits;
2908 }
2909
2910
/* Derive a second function from each 8x8 compare function; the second macro
   argument is the name of the generated function.
   NOTE(review): WARPER88_1616 is defined outside this part of the file;
   presumably it builds the 16x16 result from four 8x8 calls -- confirm at
   its definition. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2916
2917 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2918  converted */
2919 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2920 {
2921     j_rev_dct (block);
2922     put_pixels_clamped_c(block, dest, line_size);
2923 }
2924 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2925 {
2926     j_rev_dct (block);
2927     add_pixels_clamped_c(block, dest, line_size);
2928 }
2929
2930 /* init static data */
2931 void dsputil_static_init(void)
2932 {
2933     int i;
2934
2935     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2936     for(i=0;i<MAX_NEG_CROP;i++) {
2937         cropTbl[i] = 0;
2938         cropTbl[i + MAX_NEG_CROP + 256] = 255;
2939     }
2940
2941     for(i=0;i<512;i++) {
2942         squareTbl[i] = (i - 256) * (i - 256);
2943     }
2944
2945     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2946 }
2947
2948
2949 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2950 {
2951     int i;
2952
2953 #ifdef CONFIG_ENCODERS
2954     if(avctx->dct_algo==FF_DCT_FASTINT) {
2955         c->fdct = fdct_ifast;
2956         c->fdct248 = fdct_ifast248;
2957     }
2958     else if(avctx->dct_algo==FF_DCT_FAAN) {
2959         c->fdct = ff_faandct;
2960         c->fdct248 = ff_faandct248;
2961     }
2962     else {
2963         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2964         c->fdct248 = ff_fdct248_islow;
2965     }
2966 #endif //CONFIG_ENCODERS
2967
2968     if(avctx->idct_algo==FF_IDCT_INT){
2969         c->idct_put= ff_jref_idct_put;
2970         c->idct_add= ff_jref_idct_add;
2971         c->idct    = j_rev_dct;
2972         c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2973     }else{ //accurate/default
2974         c->idct_put= simple_idct_put;
2975         c->idct_add= simple_idct_add;
2976         c->idct    = simple_idct;
2977         c->idct_permutation_type= FF_NO_IDCT_PERM;
2978     }
2979
2980     c->get_pixels = get_pixels_c;
2981     c->diff_pixels = diff_pixels_c;
2982     c->put_pixels_clamped = put_pixels_clamped_c;
2983     c->add_pixels_clamped = add_pixels_clamped_c;
2984     c->gmc1 = gmc1_c;
2985     c->gmc = gmc_c;
2986     c->clear_blocks = clear_blocks_c;
2987     c->pix_sum = pix_sum_c;
2988     c->pix_norm1 = pix_norm1_c;
2989     c->sse[0]= sse16_c;
2990     c->sse[1]= sse8_c;
2991
2992     /* TODO [0] 16  [1] 8 */
2993     c->pix_abs16x16     = pix_abs16x16_c;
2994     c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
2995     c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
2996     c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2997     c->pix_abs8x8     = pix_abs8x8_c;
2998     c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
2999     c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
3000     c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
3001
3002 #define dspfunc(PFX, IDX, NUM) \
3003     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3004     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3005     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3006     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3007
3008     dspfunc(put, 0, 16);
3009     dspfunc(put_no_rnd, 0, 16);
3010     dspfunc(put, 1, 8);
3011     dspfunc(put_no_rnd, 1, 8);
3012     dspfunc(put, 2, 4);
3013     dspfunc(put, 3, 2);
3014
3015     dspfunc(avg, 0, 16);
3016     dspfunc(avg_no_rnd, 0, 16);
3017     dspfunc(avg, 1, 8);
3018     dspfunc(avg_no_rnd, 1, 8);
3019     dspfunc(avg, 2, 4);
3020     dspfunc(avg, 3, 2);
3021 #undef dspfunc
3022
3023     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3024     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3025     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3026     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3027     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3028     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3029     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3030     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3031     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3032
3033     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3034     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3035     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3036     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3037     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3038     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3039     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3040     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3041     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3042
3043 #define dspfunc(PFX, IDX, NUM) \
3044     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3045     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3046     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3047     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3048     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3049     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3050     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3051     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3052     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3053     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3054     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3055     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3056     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3057     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3058     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3059     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3060
3061     dspfunc(put_qpel, 0, 16);
3062     dspfunc(put_no_rnd_qpel, 0, 16);
3063
3064     dspfunc(avg_qpel, 0, 16);
3065     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3066
3067     dspfunc(put_qpel, 1, 8);
3068     dspfunc(put_no_rnd_qpel, 1, 8);
3069
3070     dspfunc(avg_qpel, 1, 8);
3071     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3072
3073     dspfunc(put_h264_qpel, 0, 16);
3074     dspfunc(put_h264_qpel, 1, 8);
3075     dspfunc(put_h264_qpel, 2, 4);
3076     dspfunc(avg_h264_qpel, 0, 16);
3077     dspfunc(avg_h264_qpel, 1, 8);
3078     dspfunc(avg_h264_qpel, 2, 4);
3079
3080 #undef dspfunc
3081     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3082     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3083     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3084     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3085     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3086     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3087
3088     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3089     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3090     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3091     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3092     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3093     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3094     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3095     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3096
3097     c->hadamard8_diff[0]= hadamard8_diff16_c;
3098     c->hadamard8_diff[1]= hadamard8_diff_c;
3099     c->hadamard8_abs = hadamard8_abs_c;
3100
3101     c->dct_sad[0]= dct_sad16x16_c;
3102     c->dct_sad[1]= dct_sad8x8_c;
3103
3104     c->sad[0]= sad16x16_c;
3105     c->sad[1]= sad8x8_c;
3106
3107     c->quant_psnr[0]= quant_psnr16x16_c;
3108     c->quant_psnr[1]= quant_psnr8x8_c;
3109
3110     c->rd[0]= rd16x16_c;
3111     c->rd[1]= rd8x8_c;
3112
3113     c->bit[0]= bit16x16_c;
3114     c->bit[1]= bit8x8_c;
3115
3116     c->add_bytes= add_bytes_c;
3117     c->diff_bytes= diff_bytes_c;
3118     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3119     c->bswap_buf= bswap_buf;
3120
3121     c->h263_h_loop_filter= h263_h_loop_filter_c;
3122     c->h263_v_loop_filter= h263_v_loop_filter_c;
3123
3124 #ifdef HAVE_MMX
3125     dsputil_init_mmx(c, avctx);
3126 #endif
3127 #ifdef ARCH_ARMV4L
3128     dsputil_init_armv4l(c, avctx);
3129 #endif
3130 #ifdef HAVE_MLIB
3131     dsputil_init_mlib(c, avctx);
3132 #endif
3133 #ifdef ARCH_ALPHA
3134     dsputil_init_alpha(c, avctx);
3135 #endif
3136 #ifdef ARCH_POWERPC
3137     dsputil_init_ppc(c, avctx);
3138 #endif
3139 #ifdef HAVE_MMI
3140     dsputil_init_mmi(c, avctx);
3141 #endif
3142 #ifdef ARCH_SH4
3143     dsputil_init_sh4(c,avctx);
3144 #endif
3145
3146     switch(c->idct_permutation_type){
3147     case FF_NO_IDCT_PERM:
3148         for(i=0; i<64; i++)
3149             c->idct_permutation[i]= i;
3150         break;
3151     case FF_LIBMPEG2_IDCT_PERM:
3152         for(i=0; i<64; i++)
3153             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3154         break;
3155     case FF_SIMPLE_IDCT_PERM:
3156         for(i=0; i<64; i++)
3157             c->idct_permutation[i]= simple_mmx_permutation[i];
3158         break;
3159     case FF_TRANSPOSE_IDCT_PERM:
3160         for(i=0; i<64; i++)
3161             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3162         break;
3163     default:
3164         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3165     }
3166 }
3167