git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21
  22 /**
  23  * @file dsputil.c
  24  * DSP utils
  25  */
  26
  27 #include "avcodec.h"
  28 #include "dsputil.h"
  29 #include "mpegvideo.h"
  30 #include "simple_idct.h"
  31 #include "faandct.h"
  32
  /* Byte lookup table sized as 256 entries plus MAX_NEG_CROP entries of padding on
     each side. It has no initializer here, so its contents have to be filled in
     elsewhere before it is read. */
  33 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
  /* 512-entry lookup table. Code in this file reads it through `squareTbl + 256`,
     which makes offsets from -256 to 255 valid. It has no initializer here, so its
     contents have to be filled in elsewhere before it is read. */
  34 uint32_t squareTbl[512];
  35
  /* Zigzag scan order of an 8x8 block: entry i is the raster index
     (row * 8 + column) of the i-th coefficient visited by the scan. */
  36 const uint8_t ff_zigzag_direct[64] = {
  37     0,   1,  8, 16,  9,  2,  3, 10,
  38     17, 24, 32, 25, 18, 11,  4,  5,
  39     12, 19, 26, 33, 40, 48, 41, 34,
  40     27, 20, 13,  6,  7, 14, 21, 28,
  41     35, 42, 49, 56, 57, 50, 43, 36,
  42     29, 22, 15, 23, 30, 37, 44, 51,
  43     58, 59, 52, 45, 38, 31, 39, 46,
  44     53, 60, 61, 54, 47, 55, 62, 63
  45 };
  46
  47 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  48    specification, we interleave the fields.
        As in ff_zigzag_direct, entry i is the raster index (row * 8 + column)
        of the i-th coefficient in scan order. */
  49 const uint8_t ff_zigzag248_direct[64] = {
  50      0,  8,  1,  9, 16, 24,  2, 10,
  51     17, 25, 32, 40, 48, 56, 33, 41,
  52     18, 26,  3, 11,  4, 12, 19, 27,
  53     34, 42, 49, 57, 50, 58, 35, 43,
  54     20, 28,  5, 13,  6, 14, 21, 29,
  55     36, 44, 51, 59, 52, 60, 37, 45,
  56     22, 30,  7, 15, 23, 31, 38, 46,
  57     53, 61, 54, 62, 39, 47, 55, 63,
  58 };
  59
  60 /* Non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer.
        Declared without an initializer; NOTE(review): presumably filled in by
        initialization code outside this part of the file - confirm. */
  61 uint16_t __align8 inv_zigzag_direct16[64];
  62
  /* Alternate scan order for an 8x8 block that advances mostly along rows:
     entry i is the raster index (row * 8 + column) of the i-th coefficient
     in scan order. */
  63 const uint8_t ff_alternate_horizontal_scan[64] = {
  64     0,  1,   2,  3,  8,  9, 16, 17,
  65     10, 11,  4,  5,  6,  7, 15, 14,
  66     13, 12, 19, 18, 24, 25, 32, 33,
  67     26, 27, 20, 21, 22, 23, 28, 29,
  68     30, 31, 34, 35, 40, 41, 48, 49,
  69     42, 43, 36, 37, 38, 39, 44, 45,
  70     46, 47, 50, 51, 56, 57, 58, 59,
  71     52, 53, 54, 55, 60, 61, 62, 63,
  72 };
  73
  /* Alternate scan order for an 8x8 block that advances mostly along columns:
     entry i is the raster index (row * 8 + column) of the i-th coefficient
     in scan order. */
  74 const uint8_t ff_alternate_vertical_scan[64] = {
  75     0,  8,  16, 24,  1,  9,  2, 10,
  76     17, 25, 32, 40, 48, 56, 57, 49,
  77     41, 33, 26, 18,  3, 11,  4, 12,
  78     19, 27, 34, 42, 50, 58, 35, 43,
  79     51, 59, 20, 28,  5, 13,  6, 14,
  80     21, 29, 36, 44, 52, 60, 37, 45,
  81     53, 61, 22, 30,  7, 15, 23, 31,
  82     38, 46, 54, 62, 39, 47, 55, 63,
  83 };
  84
  85 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
        (the product has to be formed in 64 bits before shifting).
        For b >= 2 the entries are 2^32 / b rounded up; inverse[0] is 0 and
        inverse[1] is 2^32 - 1. */
  86 const uint32_t inverse[256]={
  87          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  88  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  89  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  90  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  91  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  92  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  93   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  94   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  95   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  96   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  97   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  98   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  99   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 100   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 101   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 102   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 103   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 104   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 105   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 106   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 107   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 108   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 109   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 110   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 111   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 112   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 113   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 114   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 115   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 116   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 117   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 118   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 119 };
 120
 121 /* Input permutation for the simple_idct_mmx.
        64 entries, file-local. NOTE(review): the direction of the mapping
        (natural index -> permuted index, or the reverse) is not visible in this
        part of the file - check the place where the table is consumed. */
 122 static const uint8_t simple_mmx_permutation[64]={
 123         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 124         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 125         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 126         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 127         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 128         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 129         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 130         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 131 };
 132
 133 static int pix_sum_c(uint8_t * pix, int line_size)
 134 {
 135     int s, i, j;
 136
 137     s = 0;
 138     for (i = 0; i < 16; i++) {
 139         for (j = 0; j < 16; j += 8) {
 140             s += pix[0];
 141             s += pix[1];
 142             s += pix[2];
 143             s += pix[3];
 144             s += pix[4];
 145             s += pix[5];
 146             s += pix[6];
 147             s += pix[7];
 148             pix += 8;
 149         }
 150         pix += line_size - 16;
 151     }
 152     return s;
 153 }
 154
 155 static int pix_norm1_c(uint8_t * pix, int line_size)
 156 {
 157     int s, i, j;
 158     uint32_t *sq = squareTbl + 256;
 159
 160     s = 0;
 161     for (i = 0; i < 16; i++) {
 162         for (j = 0; j < 16; j += 8) {
 163 #if 0
 164             s += sq[pix[0]];
 165             s += sq[pix[1]];
 166             s += sq[pix[2]];
 167             s += sq[pix[3]];
 168             s += sq[pix[4]];
 169             s += sq[pix[5]];
 170             s += sq[pix[6]];
 171             s += sq[pix[7]];
 172 #else
 173 #if LONG_MAX > 2147483647
 174             register uint64_t x=*(uint64_t*)pix;
 175             s += sq[x&0xff];
 176             s += sq[(x>>8)&0xff];
 177             s += sq[(x>>16)&0xff];
 178             s += sq[(x>>24)&0xff];
 179             s += sq[(x>>32)&0xff];
 180             s += sq[(x>>40)&0xff];
 181             s += sq[(x>>48)&0xff];
 182             s += sq[(x>>56)&0xff];
 183 #else
 184             register uint32_t x=*(uint32_t*)pix;
 185             s += sq[x&0xff];
 186             s += sq[(x>>8)&0xff];
 187             s += sq[(x>>16)&0xff];
 188             s += sq[(x>>24)&0xff];
 189             x=*(uint32_t*)(pix+4);
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194 #endif
 195 #endif
 196             pix += 8;
 197         }
 198         pix += line_size - 16;
 199     }
 200     return s;
 201 }
 202
 203 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 204     int i;
 205
 206     for(i=0; i+8<=w; i+=8){
 207         dst[i+0]= bswap_32(src[i+0]);
 208         dst[i+1]= bswap_32(src[i+1]);
 209         dst[i+2]= bswap_32(src[i+2]);
 210         dst[i+3]= bswap_32(src[i+3]);
 211         dst[i+4]= bswap_32(src[i+4]);
 212         dst[i+5]= bswap_32(src[i+5]);
 213         dst[i+6]= bswap_32(src[i+6]);
 214         dst[i+7]= bswap_32(src[i+7]);
 215     }
 216     for(;i<w; i++){
 217         dst[i+0]= bswap_32(src[i+0]);
 218     }
 219 }
 220
 221 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
 222 {
 223     int s, i;
 224     uint32_t *sq = squareTbl + 256;
 225
 226     s = 0;
 227     for (i = 0; i < 8; i++) {
 228         s += sq[pix1[0] - pix2[0]];
 229         s += sq[pix1[1] - pix2[1]];
 230         s += sq[pix1[2] - pix2[2]];
 231         s += sq[pix1[3] - pix2[3]];
 232         s += sq[pix1[4] - pix2[4]];
 233         s += sq[pix1[5] - pix2[5]];
 234         s += sq[pix1[6] - pix2[6]];
 235         s += sq[pix1[7] - pix2[7]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < 16; i++) {
 249         s += sq[pix1[ 0] - pix2[ 0]];
 250         s += sq[pix1[ 1] - pix2[ 1]];
 251         s += sq[pix1[ 2] - pix2[ 2]];
 252         s += sq[pix1[ 3] - pix2[ 3]];
 253         s += sq[pix1[ 4] - pix2[ 4]];
 254         s += sq[pix1[ 5] - pix2[ 5]];
 255         s += sq[pix1[ 6] - pix2[ 6]];
 256         s += sq[pix1[ 7] - pix2[ 7]];
 257         s += sq[pix1[ 8] - pix2[ 8]];
 258         s += sq[pix1[ 9] - pix2[ 9]];
 259         s += sq[pix1[10] - pix2[10]];
 260         s += sq[pix1[11] - pix2[11]];
 261         s += sq[pix1[12] - pix2[12]];
 262         s += sq[pix1[13] - pix2[13]];
 263         s += sq[pix1[14] - pix2[14]];
 264         s += sq[pix1[15] - pix2[15]];
 265
 266         pix1 += line_size;
 267         pix2 += line_size;
 268     }
 269     return s;
 270 }
 271
 272 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 273 {
 274     int i;
 275
 276     /* read the pixels */
 277     for(i=0;i<8;i++) {
 278         block[0] = pixels[0];
 279         block[1] = pixels[1];
 280         block[2] = pixels[2];
 281         block[3] = pixels[3];
 282         block[4] = pixels[4];
 283         block[5] = pixels[5];
 284         block[6] = pixels[6];
 285         block[7] = pixels[7];
 286         pixels += line_size;
 287         block += 8;
 288     }
 289 }
 290
 291 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 292                           const uint8_t *s2, int stride){
 293     int i;
 294
 295     /* read the pixels */
 296     for(i=0;i<8;i++) {
 297         block[0] = s1[0] - s2[0];
 298         block[1] = s1[1] - s2[1];
 299         block[2] = s1[2] - s2[2];
 300         block[3] = s1[3] - s2[3];
 301         block[4] = s1[4] - s2[4];
 302         block[5] = s1[5] - s2[5];
 303         block[6] = s1[6] - s2[6];
 304         block[7] = s1[7] - s2[7];
 305         s1 += stride;
 306         s2 += stride;
 307         block += 8;
 308     }
 309 }
 310
 311
 312 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 313                                  int line_size)
 314 {
 315     int i;
 316     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 317
 318     /* read the pixels */
 319     for(i=0;i<8;i++) {
 320         pixels[0] = cm[block[0]];
 321         pixels[1] = cm[block[1]];
 322         pixels[2] = cm[block[2]];
 323         pixels[3] = cm[block[3]];
 324         pixels[4] = cm[block[4]];
 325         pixels[5] = cm[block[5]];
 326         pixels[6] = cm[block[6]];
 327         pixels[7] = cm[block[7]];
 328
 329         pixels += line_size;
 330         block += 8;
 331     }
 332 }
 333
 334 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 335                           int line_size)
 336 {
 337     int i;
 338     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 339
 340     /* read the pixels */
 341     for(i=0;i<8;i++) {
 342         pixels[0] = cm[pixels[0] + block[0]];
 343         pixels[1] = cm[pixels[1] + block[1]];
 344         pixels[2] = cm[pixels[2] + block[2]];
 345         pixels[3] = cm[pixels[3] + block[3]];
 346         pixels[4] = cm[pixels[4] + block[4]];
 347         pixels[5] = cm[pixels[5] + block[5]];
 348         pixels[6] = cm[pixels[6] + block[6]];
 349         pixels[7] = cm[pixels[7] + block[7]];
 350         pixels += line_size;
 351         block += 8;
 352     }
 353 }
 354 #if 0
 355
 356 #define PIXOP2(OPNAME, OP) \
 357 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 358 {\
 359     int i;\
 360     for(i=0; i<h; i++){\
 361         OP(*((uint64_t*)block), LD64(pixels));\
 362         pixels+=line_size;\
 363         block +=line_size;\
 364     }\
 365 }\
 366 \
 367 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 368 {\
 369     int i;\
 370     for(i=0; i<h; i++){\
 371         const uint64_t a= LD64(pixels  );\
 372         const uint64_t b= LD64(pixels+1);\
 373         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 374         pixels+=line_size;\
 375         block +=line_size;\
 376     }\
 377 }\
 378 \
 379 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 380 {\
 381     int i;\
 382     for(i=0; i<h; i++){\
 383         const uint64_t a= LD64(pixels  );\
 384         const uint64_t b= LD64(pixels+1);\
 385         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 386         pixels+=line_size;\
 387         block +=line_size;\
 388     }\
 389 }\
 390 \
 391 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 392 {\
 393     int i;\
 394     for(i=0; i<h; i++){\
 395         const uint64_t a= LD64(pixels          );\
 396         const uint64_t b= LD64(pixels+line_size);\
 397         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 398         pixels+=line_size;\
 399         block +=line_size;\
 400     }\
 401 }\
 402 \
 403 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 404 {\
 405     int i;\
 406     for(i=0; i<h; i++){\
 407         const uint64_t a= LD64(pixels          );\
 408         const uint64_t b= LD64(pixels+line_size);\
 409         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 410         pixels+=line_size;\
 411         block +=line_size;\
 412     }\
 413 }\
 414 \
 415 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 416 {\
 417         int i;\
 418         const uint64_t a= LD64(pixels  );\
 419         const uint64_t b= LD64(pixels+1);\
 420         uint64_t l0=  (a&0x0303030303030303ULL)\
 421                     + (b&0x0303030303030303ULL)\
 422                     + 0x0202020202020202ULL;\
 423         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 424                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 425         uint64_t l1,h1;\
 426 \
 427         pixels+=line_size;\
 428         for(i=0; i<h; i+=2){\
 429             uint64_t a= LD64(pixels  );\
 430             uint64_t b= LD64(pixels+1);\
 431             l1=  (a&0x0303030303030303ULL)\
 432                + (b&0x0303030303030303ULL);\
 433             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 434               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 435             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 436             pixels+=line_size;\
 437             block +=line_size;\
 438             a= LD64(pixels  );\
 439             b= LD64(pixels+1);\
 440             l0=  (a&0x0303030303030303ULL)\
 441                + (b&0x0303030303030303ULL)\
 442                + 0x0202020202020202ULL;\
 443             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 444               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 445             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 446             pixels+=line_size;\
 447             block +=line_size;\
 448         }\
 449 }\
 450 \
 451 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 452 {\
 453         int i;\
 454         const uint64_t a= LD64(pixels  );\
 455         const uint64_t b= LD64(pixels+1);\
 456         uint64_t l0=  (a&0x0303030303030303ULL)\
 457                     + (b&0x0303030303030303ULL)\
 458                     + 0x0101010101010101ULL;\
 459         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 460                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 461         uint64_t l1,h1;\
 462 \
 463         pixels+=line_size;\
 464         for(i=0; i<h; i+=2){\
 465             uint64_t a= LD64(pixels  );\
 466             uint64_t b= LD64(pixels+1);\
 467             l1=  (a&0x0303030303030303ULL)\
 468                + (b&0x0303030303030303ULL);\
 469             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 470               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 471             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 472             pixels+=line_size;\
 473             block +=line_size;\
 474             a= LD64(pixels  );\
 475             b= LD64(pixels+1);\
 476             l0=  (a&0x0303030303030303ULL)\
 477                + (b&0x0303030303030303ULL)\
 478                + 0x0101010101010101ULL;\
 479             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 480               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 481             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 482             pixels+=line_size;\
 483             block +=line_size;\
 484         }\
 485 }\
 486 \
 487 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 488 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 489 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 490 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 491 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 492 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 493 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 494
 495 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 496 #else // 64 bit variant
 497
 498 #define PIXOP2(OPNAME, OP) \
 499 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 500     int i;\
 501     for(i=0; i<h; i++){\
 502         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 503         pixels+=line_size;\
 504         block +=line_size;\
 505     }\
 506 }\
 507 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 508     int i;\
 509     for(i=0; i<h; i++){\
 510         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 511         pixels+=line_size;\
 512         block +=line_size;\
 513     }\
 514 }\
 515 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 516     int i;\
 517     for(i=0; i<h; i++){\
 518         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 519         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 520         pixels+=line_size;\
 521         block +=line_size;\
 522     }\
 523 }\
 524 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 525     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 526 }\
 527 \
 528 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 529                                                 int src_stride1, int src_stride2, int h){\
 530     int i;\
 531     for(i=0; i<h; i++){\
 532         uint32_t a,b;\
 533         a= LD32(&src1[i*src_stride1  ]);\
 534         b= LD32(&src2[i*src_stride2  ]);\
 535         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 536         a= LD32(&src1[i*src_stride1+4]);\
 537         b= LD32(&src2[i*src_stride2+4]);\
 538         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 539     }\
 540 }\
 541 \
 542 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 543                                                 int src_stride1, int src_stride2, int h){\
 544     int i;\
 545     for(i=0; i<h; i++){\
 546         uint32_t a,b;\
 547         a= LD32(&src1[i*src_stride1  ]);\
 548         b= LD32(&src2[i*src_stride2  ]);\
 549         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 550         a= LD32(&src1[i*src_stride1+4]);\
 551         b= LD32(&src2[i*src_stride2+4]);\
 552         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 553     }\
 554 }\
 555 \
 556 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 557                                                 int src_stride1, int src_stride2, int h){\
 558     int i;\
 559     for(i=0; i<h; i++){\
 560         uint32_t a,b;\
 561         a= LD32(&src1[i*src_stride1  ]);\
 562         b= LD32(&src2[i*src_stride2  ]);\
 563         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 564     }\
 565 }\
 566 \
 567 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 568                                                 int src_stride1, int src_stride2, int h){\
 569     int i;\
 570     for(i=0; i<h; i++){\
 571         uint32_t a,b;\
 572         a= LD16(&src1[i*src_stride1  ]);\
 573         b= LD16(&src2[i*src_stride2  ]);\
 574         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 575     }\
 576 }\
 577 \
 578 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 579                                                 int src_stride1, int src_stride2, int h){\
 580     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 581     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 582 }\
 583 \
 584 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 585                                                 int src_stride1, int src_stride2, int h){\
 586     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 587     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 588 }\
 589 \
 590 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 591     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 592 }\
 593 \
 594 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 595     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 596 }\
 597 \
 598 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 599     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 600 }\
 601 \
 602 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 603     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 604 }\
 605 \
 606 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 607                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 608     int i;\
 609     for(i=0; i<h; i++){\
 610         uint32_t a, b, c, d, l0, l1, h0, h1;\
 611         a= LD32(&src1[i*src_stride1]);\
 612         b= LD32(&src2[i*src_stride2]);\
 613         c= LD32(&src3[i*src_stride3]);\
 614         d= LD32(&src4[i*src_stride4]);\
 615         l0=  (a&0x03030303UL)\
 616            + (b&0x03030303UL)\
 617            + 0x02020202UL;\
 618         h0= ((a&0xFCFCFCFCUL)>>2)\
 619           + ((b&0xFCFCFCFCUL)>>2);\
 620         l1=  (c&0x03030303UL)\
 621            + (d&0x03030303UL);\
 622         h1= ((c&0xFCFCFCFCUL)>>2)\
 623           + ((d&0xFCFCFCFCUL)>>2);\
 624         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 625         a= LD32(&src1[i*src_stride1+4]);\
 626         b= LD32(&src2[i*src_stride2+4]);\
 627         c= LD32(&src3[i*src_stride3+4]);\
 628         d= LD32(&src4[i*src_stride4+4]);\
 629         l0=  (a&0x03030303UL)\
 630            + (b&0x03030303UL)\
 631            + 0x02020202UL;\
 632         h0= ((a&0xFCFCFCFCUL)>>2)\
 633           + ((b&0xFCFCFCFCUL)>>2);\
 634         l1=  (c&0x03030303UL)\
 635            + (d&0x03030303UL);\
 636         h1= ((c&0xFCFCFCFCUL)>>2)\
 637           + ((d&0xFCFCFCFCUL)>>2);\
 638         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 639     }\
 640 }\
 641 \
 642 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 643     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 644 }\
 645 \
 646 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 647     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 648 }\
 649 \
 650 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 651     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 652 }\
 653 \
 654 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 655     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 656 }\
 657 \
 658 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 659                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 660     int i;\
 661     for(i=0; i<h; i++){\
 662         uint32_t a, b, c, d, l0, l1, h0, h1;\
 663         a= LD32(&src1[i*src_stride1]);\
 664         b= LD32(&src2[i*src_stride2]);\
 665         c= LD32(&src3[i*src_stride3]);\
 666         d= LD32(&src4[i*src_stride4]);\
 667         l0=  (a&0x03030303UL)\
 668            + (b&0x03030303UL)\
 669            + 0x01010101UL;\
 670         h0= ((a&0xFCFCFCFCUL)>>2)\
 671           + ((b&0xFCFCFCFCUL)>>2);\
 672         l1=  (c&0x03030303UL)\
 673            + (d&0x03030303UL);\
 674         h1= ((c&0xFCFCFCFCUL)>>2)\
 675           + ((d&0xFCFCFCFCUL)>>2);\
 676         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 677         a= LD32(&src1[i*src_stride1+4]);\
 678         b= LD32(&src2[i*src_stride2+4]);\
 679         c= LD32(&src3[i*src_stride3+4]);\
 680         d= LD32(&src4[i*src_stride4+4]);\
 681         l0=  (a&0x03030303UL)\
 682            + (b&0x03030303UL)\
 683            + 0x01010101UL;\
 684         h0= ((a&0xFCFCFCFCUL)>>2)\
 685           + ((b&0xFCFCFCFCUL)>>2);\
 686         l1=  (c&0x03030303UL)\
 687            + (d&0x03030303UL);\
 688         h1= ((c&0xFCFCFCFCUL)>>2)\
 689           + ((d&0xFCFCFCFCUL)>>2);\
 690         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 691     }\
 692 }\
 693 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 694                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 695     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 696     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 697 }\
 698 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 699                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 700     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 701     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 702 }\
 703 \
 704 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 705 {\
 706         int i, a0, b0, a1, b1;\
 707         a0= pixels[0];\
 708         b0= pixels[1] + 2;\
 709         a0 += b0;\
 710         b0 += pixels[2];\
 711 \
 712         pixels+=line_size;\
 713         for(i=0; i<h; i+=2){\
 714             a1= pixels[0];\
 715             b1= pixels[1];\
 716             a1 += b1;\
 717             b1 += pixels[2];\
 718 \
 719             block[0]= (a1+a0)>>2; /* FIXME non put */\
 720             block[1]= (b1+b0)>>2;\
 721 \
 722             pixels+=line_size;\
 723             block +=line_size;\
 724 \
 725             a0= pixels[0];\
 726             b0= pixels[1] + 2;\
 727             a0 += b0;\
 728             b0 += pixels[2];\
 729 \
 730             block[0]= (a1+a0)>>2;\
 731             block[1]= (b1+b0)>>2;\
 732             pixels+=line_size;\
 733             block +=line_size;\
 734         }\
 735 }\
 736 \
 737 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 738 {\
 739         int i;\
 740         const uint32_t a= LD32(pixels  );\
 741         const uint32_t b= LD32(pixels+1);\
 742         uint32_t l0=  (a&0x03030303UL)\
 743                     + (b&0x03030303UL)\
 744                     + 0x02020202UL;\
 745         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 746                    + ((b&0xFCFCFCFCUL)>>2);\
 747         uint32_t l1,h1;\
 748 \
 749         pixels+=line_size;\
 750         for(i=0; i<h; i+=2){\
 751             uint32_t a= LD32(pixels  );\
 752             uint32_t b= LD32(pixels+1);\
 753             l1=  (a&0x03030303UL)\
 754                + (b&0x03030303UL);\
 755             h1= ((a&0xFCFCFCFCUL)>>2)\
 756               + ((b&0xFCFCFCFCUL)>>2);\
 757             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 758             pixels+=line_size;\
 759             block +=line_size;\
 760             a= LD32(pixels  );\
 761             b= LD32(pixels+1);\
 762             l0=  (a&0x03030303UL)\
 763                + (b&0x03030303UL)\
 764                + 0x02020202UL;\
 765             h0= ((a&0xFCFCFCFCUL)>>2)\
 766               + ((b&0xFCFCFCFCUL)>>2);\
 767             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 768             pixels+=line_size;\
 769             block +=line_size;\
 770         }\
 771 }\
 772 \
 773 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 774 {\
 775     int j;\
 776     for(j=0; j<2; j++){\
 777         int i;\
 778         const uint32_t a= LD32(pixels  );\
 779         const uint32_t b= LD32(pixels+1);\
 780         uint32_t l0=  (a&0x03030303UL)\
 781                     + (b&0x03030303UL)\
 782                     + 0x02020202UL;\
 783         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 784                    + ((b&0xFCFCFCFCUL)>>2);\
 785         uint32_t l1,h1;\
 786 \
 787         pixels+=line_size;\
 788         for(i=0; i<h; i+=2){\
 789             uint32_t a= LD32(pixels  );\
 790             uint32_t b= LD32(pixels+1);\
 791             l1=  (a&0x03030303UL)\
 792                + (b&0x03030303UL);\
 793             h1= ((a&0xFCFCFCFCUL)>>2)\
 794               + ((b&0xFCFCFCFCUL)>>2);\
 795             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 796             pixels+=line_size;\
 797             block +=line_size;\
 798             a= LD32(pixels  );\
 799             b= LD32(pixels+1);\
 800             l0=  (a&0x03030303UL)\
 801                + (b&0x03030303UL)\
 802                + 0x02020202UL;\
 803             h0= ((a&0xFCFCFCFCUL)>>2)\
 804               + ((b&0xFCFCFCFCUL)>>2);\
 805             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 806             pixels+=line_size;\
 807             block +=line_size;\
 808         }\
 809         pixels+=4-line_size*(h+1);\
 810         block +=4-line_size*h;\
 811     }\
 812 }\
 813 \
 814 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 815 {\
 816     int j;\
 817     for(j=0; j<2; j++){\
 818         int i;\
 819         const uint32_t a= LD32(pixels  );\
 820         const uint32_t b= LD32(pixels+1);\
 821         uint32_t l0=  (a&0x03030303UL)\
 822                     + (b&0x03030303UL)\
 823                     + 0x01010101UL;\
 824         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 825                    + ((b&0xFCFCFCFCUL)>>2);\
 826         uint32_t l1,h1;\
 827 \
 828         pixels+=line_size;\
 829         for(i=0; i<h; i+=2){\
 830             uint32_t a= LD32(pixels  );\
 831             uint32_t b= LD32(pixels+1);\
 832             l1=  (a&0x03030303UL)\
 833                + (b&0x03030303UL);\
 834             h1= ((a&0xFCFCFCFCUL)>>2)\
 835               + ((b&0xFCFCFCFCUL)>>2);\
 836             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 837             pixels+=line_size;\
 838             block +=line_size;\
 839             a= LD32(pixels  );\
 840             b= LD32(pixels+1);\
 841             l0=  (a&0x03030303UL)\
 842                + (b&0x03030303UL)\
 843                + 0x01010101UL;\
 844             h0= ((a&0xFCFCFCFCUL)>>2)\
 845               + ((b&0xFCFCFCFCUL)>>2);\
 846             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 847             pixels+=line_size;\
 848             block +=line_size;\
 849         }\
 850         pixels+=4-line_size*(h+1);\
 851         block +=4-line_size*h;\
 852     }\
 853 }\
 854 \
 855 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
 856 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
 857 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
 858 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
 859 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
 860 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
 861 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
 862 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
 863
 864 #define op_avg(a, b) a = rnd_avg32(a, b)
 865 #endif
/* "put" operation handed to PIXOP2 as OP: plain assignment of b to a. */
#define op_put(a, b) a = b

/* Expand PIXOP2 twice: once with the name prefix "avg" and the op_avg
 * operation, once with the name prefix "put" and the op_put operation. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* The operation macros are dropped again right after the two expansions. */
#undef op_avg
#undef op_put
 872
 873 #define avg2(a,b) ((a+b+1)>>1)
 874 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 875
 876
 877 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 878 {
 879     const int A=(16-x16)*(16-y16);
 880     const int B=(   x16)*(16-y16);
 881     const int C=(16-x16)*(   y16);
 882     const int D=(   x16)*(   y16);
 883     int i;
 884
 885     for(i=0; i<h; i++)
 886     {
 887         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 888         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 889         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 890         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 891         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 892         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 893         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 894         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 895         dst+= stride;
 896         src+= stride;
 897     }
 898 }
 899
 900 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 901                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 902 {
 903     int y, vx, vy;
 904     const int s= 1<<shift;
 905
 906     width--;
 907     height--;
 908
 909     for(y=0; y<h; y++){
 910         int x;
 911
 912         vx= ox;
 913         vy= oy;
 914         for(x=0; x<8; x++){ //XXX FIXME optimize
 915             int src_x, src_y, frac_x, frac_y, index;
 916
 917             src_x= vx>>16;
 918             src_y= vy>>16;
 919             frac_x= src_x&(s-1);
 920             frac_y= src_y&(s-1);
 921             src_x>>=shift;
 922             src_y>>=shift;
 923
 924             if((unsigned)src_x < width){
 925                 if((unsigned)src_y < height){
 926                     index= src_x + src_y*stride;
 927                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 928                                            + src[index       +1]*   frac_x )*(s-frac_y)
 929                                         + (  src[index+stride  ]*(s-frac_x)
 930                                            + src[index+stride+1]*   frac_x )*   frac_y
 931                                         + r)>>(shift*2);
 932                 }else{
 933                     index= src_x + clip(src_y, 0, height)*stride;
 934                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 935                                           + src[index       +1]*   frac_x )*s
 936                                         + r)>>(shift*2);
 937                 }
 938             }else{
 939                 if((unsigned)src_y < height){
 940                     index= clip(src_x, 0, width) + src_y*stride;
 941                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 942                                            + src[index+stride  ]*   frac_y )*s
 943                                         + r)>>(shift*2);
 944                 }else{
 945                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
 946                     dst[y*stride + x]=    src[index         ];
 947                 }
 948             }
 949
 950             vx+= dxx;
 951             vy+= dyx;
 952         }
 953         ox += dxy;
 954         oy += dyy;
 955     }
 956 }
 957
 958 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 959     switch(width){
 960     case 2: put_pixels2_c (dst, src, stride, height); break;
 961     case 4: put_pixels4_c (dst, src, stride, height); break;
 962     case 8: put_pixels8_c (dst, src, stride, height); break;
 963     case 16:put_pixels16_c(dst, src, stride, height); break;
 964     }
 965 }
 966
 967 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 968     int i,j;
 969     for (i=0; i < height; i++) {
 970       for (j=0; j < width; j++) {
 971         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 972       }
 973       src += stride;
 974       dst += stride;
 975     }
 976 }
 977
 978 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 979     int i,j;
 980     for (i=0; i < height; i++) {
 981       for (j=0; j < width; j++) {
 982         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 983       }
 984       src += stride;
 985       dst += stride;
 986     }
 987 }
 988
 989 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 990     int i,j;
 991     for (i=0; i < height; i++) {
 992       for (j=0; j < width; j++) {
 993         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 994       }
 995       src += stride;
 996       dst += stride;
 997     }
 998 }
 999
/**
 * tpel "put", 2x2 filter with weights
 *     4 3
 *     3 2
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a division by 12.
 * Writes width x height pixels; reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1010
/**
 * tpel "put", 2x2 filter with weights
 *     3 2
 *     4 3
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a division by 12.
 * Writes width x height pixels; reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1021
/**
 * tpel "put", vertical filter with weights 1:2 (current pixel : pixel one row below).
 * The weighted sum plus 1 is scaled by 683/2048, which approximates a division by 3.
 * Writes width x height pixels; reads one extra row below.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1032
/**
 * tpel "put", 2x2 filter with weights
 *     3 4
 *     2 3
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a division by 12.
 * Writes width x height pixels; reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1043
/**
 * tpel "put", 2x2 filter with weights
 *     2 3
 *     3 4
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a division by 12.
 * Writes width x height pixels; reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1054
/**
 * tpel "avg" with no sub-pixel offset: forwards to the fixed-width
 * avg_pixels{2,4,8,16}_c routine that matches width.
 * Any other width is ignored and nothing is written.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1063
/**
 * tpel "avg", horizontal filter with weights 2:1 (current pixel : right neighbour).
 * The filtered value is (683*(weighted sum + 1)) >> 11, i.e. roughly sum/3;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra column to the right.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1074
/**
 * tpel "avg", horizontal filter with weights 1:2 (current pixel : right neighbour).
 * The filtered value is (683*(weighted sum + 1)) >> 11, i.e. roughly sum/3;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra column to the right.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1085
/**
 * tpel "avg", vertical filter with weights 2:1 (current pixel : pixel one row below).
 * The filtered value is (683*(weighted sum + 1)) >> 11, i.e. roughly sum/3;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra row below.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1096
/**
 * tpel "avg", 2x2 filter with weights
 *     4 3
 *     3 2
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The filtered value is (2731*(weighted sum + 6)) >> 15, i.e. roughly sum/12;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1107
/**
 * tpel "avg", 2x2 filter with weights
 *     3 2
 *     4 3
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The filtered value is (2731*(weighted sum + 6)) >> 15, i.e. roughly sum/12;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1118
/**
 * tpel "avg", vertical filter with weights 1:2 (current pixel : pixel one row below).
 * The filtered value is (683*(weighted sum + 1)) >> 11, i.e. roughly sum/3;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra row below.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1129
/**
 * tpel "avg", 2x2 filter with weights
 *     3 4
 *     2 3
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The filtered value is (2731*(weighted sum + 6)) >> 15, i.e. roughly sum/12;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1140
/**
 * tpel "avg", 2x2 filter with weights
 *     2 3
 *     3 4
 * (top row: current pixel, right neighbour; bottom row: the same one row below).
 * The filtered value is (2731*(weighted sum + 6)) >> 15, i.e. roughly sum/12;
 * it is then averaged with the existing dst pixel, rounding up.
 * Updates width x height pixels; reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int filtered = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
#if 0
/*
 * Compiled out by the surrounding "#if 0".
 *
 * TPEL_WIDTH(width) defines nine wrappers named
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height), one per mcXY
 * variant, each passing the macro argument on as the "width" parameter of the
 * generic put_tpel_pixels_mcXY_c(dst, src, stride, width, height) helper.
 *
 * NOTE(review): every wrapper body begins with "void" in front of the helper
 * name, so the line is not a plain function call as written; this would have
 * to be fixed before the block is enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1172
/*
 * H264_CHROMA_MC(OPNAME, OP) defines three functions,
 *   OPNAME##h264_chroma_mc2_c, OPNAME##h264_chroma_mc4_c and
 *   OPNAME##h264_chroma_mc8_c,
 * which process blocks that are 2, 4 and 8 pixels wide and h rows high.
 *
 * For every output pixel the 2x2 neighbourhood (pixel, right neighbour, and
 * the same two one row below) is combined with weights derived from x and y
 * (both asserted to be in 0..7); the four weights add up to 64. The raw
 * weighted sum is handed to OP together with the destination pixel, and OP
 * decides how it is scaled and stored.
 *
 * dst and src share the same stride. One column to the right of the block and
 * one row below it are read from src.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}

/* b is the raw weighted sum (weights add up to 64): round, scale by >>6,
 * then either average with the existing pixel (avg) or store it (put). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1243
/**
 * Copy a block that is 4 bytes wide and h rows high.
 * Each row is moved as one 32-bit load/store through LD32/ST32.
 *
 * @param dstStride distance in bytes between destination rows
 * @param srcStride distance in bytes between source rows
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1254
/**
 * Copy a block that is 8 bytes wide and h rows high.
 * Each row is moved as two 32-bit loads/stores through LD32/ST32.
 *
 * @param dstStride distance in bytes between destination rows
 * @param srcStride distance in bytes between source rows
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1266
/**
 * Copy a block that is 16 bytes wide and h rows high.
 * Each row is moved as four 32-bit loads/stores through LD32/ST32.
 *
 * @param dstStride distance in bytes between destination rows
 * @param srcStride distance in bytes between source rows
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1280
/**
 * Copy a block that is 17 bytes wide and h rows high.
 * Each row is moved as four 32-bit loads/stores through LD32/ST32, followed
 * by a single byte copy for the 17th column.
 *
 * @param dstStride distance in bytes between destination rows
 * @param srcStride distance in bytes between source rows
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
    }
}
1295
/**
 * Copy a block that is 9 bytes wide and h rows high.
 * Each row is moved as two 32-bit loads/stores through LD32/ST32, followed
 * by a single byte copy for the 9th column.
 *
 * @param dstStride distance in bytes between destination rows
 * @param srcStride distance in bytes between source rows
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
    }
}
1308
1309
1310 #define QPEL_MC(r, OPNAME, RND, OP) \
1311 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313     int i;\
1314     for(i=0; i<h; i++)\
1315     {\
1316         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324         dst+=dstStride;\
1325         src+=srcStride;\
1326     }\
1327 }\
1328 \
1329 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330     const int w=8;\
1331     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332     int i;\
1333     for(i=0; i<w; i++)\
1334     {\
1335         const int src0= src[0*srcStride];\
1336         const int src1= src[1*srcStride];\
1337         const int src2= src[2*srcStride];\
1338         const int src3= src[3*srcStride];\
1339         const int src4= src[4*srcStride];\
1340         const int src5= src[5*srcStride];\
1341         const int src6= src[6*srcStride];\
1342         const int src7= src[7*srcStride];\
1343         const int src8= src[8*srcStride];\
1344         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352         dst++;\
1353         src++;\
1354     }\
1355 }\
1356 \
1357 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359     int i;\
1360     \
1361     for(i=0; i<h; i++)\
1362     {\
1363         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379         dst+=dstStride;\
1380         src+=srcStride;\
1381     }\
1382 }\
1383 \
1384 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386     int i;\
1387     const int w=16;\
1388     for(i=0; i<w; i++)\
1389     {\
1390         const int src0= src[0*srcStride];\
1391         const int src1= src[1*srcStride];\
1392         const int src2= src[2*srcStride];\
1393         const int src3= src[3*srcStride];\
1394         const int src4= src[4*srcStride];\
1395         const int src5= src[5*srcStride];\
1396         const int src6= src[6*srcStride];\
1397         const int src7= src[7*srcStride];\
1398         const int src8= src[8*srcStride];\
1399         const int src9= src[9*srcStride];\
1400         const int src10= src[10*srcStride];\
1401         const int src11= src[11*srcStride];\
1402         const int src12= src[12*srcStride];\
1403         const int src13= src[13*srcStride];\
1404         const int src14= src[14*srcStride];\
1405         const int src15= src[15*srcStride];\
1406         const int src16= src[16*srcStride];\
1407         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423         dst++;\
1424         src++;\
1425     }\
1426 }\
1427 \
1428 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429     OPNAME ## pixels8_c(dst, src, stride, 8);\
1430 }\
1431 \
1432 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433     uint8_t half[64];\
1434     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436 }\
1437 \
1438 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440 }\
1441 \
1442 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443     uint8_t half[64];\
1444     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446 }\
1447 \
1448 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449     uint8_t full[16*9];\
1450     uint8_t half[64];\
1451     copy_block9(full, src, 16, stride, 9);\
1452     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454 }\
1455 \
1456 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457     uint8_t full[16*9];\
1458     copy_block9(full, src, 16, stride, 9);\
1459     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460 }\
1461 \
1462 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463     uint8_t full[16*9];\
1464     uint8_t half[64];\
1465     copy_block9(full, src, 16, stride, 9);\
1466     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468 }\
1469 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470     uint8_t full[16*9];\
1471     uint8_t halfH[72];\
1472     uint8_t halfV[64];\
1473     uint8_t halfHV[64];\
1474     copy_block9(full, src, 16, stride, 9);\
1475     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479 }\
1480 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481     uint8_t full[16*9];\
1482     uint8_t halfH[72];\
1483     uint8_t halfHV[64];\
1484     copy_block9(full, src, 16, stride, 9);\
1485     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489 }\
1490 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491     uint8_t full[16*9];\
1492     uint8_t halfH[72];\
1493     uint8_t halfV[64];\
1494     uint8_t halfHV[64];\
1495     copy_block9(full, src, 16, stride, 9);\
1496     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500 }\
1501 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502     uint8_t full[16*9];\
1503     uint8_t halfH[72];\
1504     uint8_t halfHV[64];\
1505     copy_block9(full, src, 16, stride, 9);\
1506     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510 }\
1511 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512     uint8_t full[16*9];\
1513     uint8_t halfH[72];\
1514     uint8_t halfV[64];\
1515     uint8_t halfHV[64];\
1516     copy_block9(full, src, 16, stride, 9);\
1517     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521 }\
1522 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523     uint8_t full[16*9];\
1524     uint8_t halfH[72];\
1525     uint8_t halfHV[64];\
1526     copy_block9(full, src, 16, stride, 9);\
1527     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531 }\
1532 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533     uint8_t full[16*9];\
1534     uint8_t halfH[72];\
1535     uint8_t halfV[64];\
1536     uint8_t halfHV[64];\
1537     copy_block9(full, src, 16, stride, 9);\
1538     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1539     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542 }\
1543 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544     uint8_t full[16*9];\
1545     uint8_t halfH[72];\
1546     uint8_t halfHV[64];\
1547     copy_block9(full, src, 16, stride, 9);\
1548     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552 }\
1553 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554     uint8_t halfH[72];\
1555     uint8_t halfHV[64];\
1556     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559 }\
1560 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561     uint8_t halfH[72];\
1562     uint8_t halfHV[64];\
1563     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566 }\
1567 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568     uint8_t full[16*9];\
1569     uint8_t halfH[72];\
1570     uint8_t halfV[64];\
1571     uint8_t halfHV[64];\
1572     copy_block9(full, src, 16, stride, 9);\
1573     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577 }\
1578 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579     uint8_t full[16*9];\
1580     uint8_t halfH[72];\
1581     copy_block9(full, src, 16, stride, 9);\
1582     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585 }\
1586 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587     uint8_t full[16*9];\
1588     uint8_t halfH[72];\
1589     uint8_t halfV[64];\
1590     uint8_t halfHV[64];\
1591     copy_block9(full, src, 16, stride, 9);\
1592     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596 }\
1597 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598     uint8_t full[16*9];\
1599     uint8_t halfH[72];\
1600     copy_block9(full, src, 16, stride, 9);\
1601     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604 }\
1605 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606     uint8_t halfH[72];\
1607     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609 }\
1610 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611     OPNAME ## pixels16_c(dst, src, stride, 16);\
1612 }\
1613 \
1614 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615     uint8_t half[256];\
1616     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618 }\
1619 \
1620 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622 }\
1623 \
1624 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625     uint8_t half[256];\
1626     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628 }\
1629 \
1630 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631     uint8_t full[24*17];\
1632     uint8_t half[256];\
1633     copy_block17(full, src, 24, stride, 17);\
1634     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636 }\
1637 \
1638 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639     uint8_t full[24*17];\
1640     copy_block17(full, src, 24, stride, 17);\
1641     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642 }\
1643 \
1644 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645     uint8_t full[24*17];\
1646     uint8_t half[256];\
1647     copy_block17(full, src, 24, stride, 17);\
1648     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650 }\
1651 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652     uint8_t full[24*17];\
1653     uint8_t halfH[272];\
1654     uint8_t halfV[256];\
1655     uint8_t halfHV[256];\
1656     copy_block17(full, src, 24, stride, 17);\
1657     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661 }\
1662 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663     uint8_t full[24*17];\
1664     uint8_t halfH[272];\
1665     uint8_t halfHV[256];\
1666     copy_block17(full, src, 24, stride, 17);\
1667     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671 }\
1672 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673     uint8_t full[24*17];\
1674     uint8_t halfH[272];\
1675     uint8_t halfV[256];\
1676     uint8_t halfHV[256];\
1677     copy_block17(full, src, 24, stride, 17);\
1678     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682 }\
1683 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684     uint8_t full[24*17];\
1685     uint8_t halfH[272];\
1686     uint8_t halfHV[256];\
1687     copy_block17(full, src, 24, stride, 17);\
1688     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692 }\
1693 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694     uint8_t full[24*17];\
1695     uint8_t halfH[272];\
1696     uint8_t halfV[256];\
1697     uint8_t halfHV[256];\
1698     copy_block17(full, src, 24, stride, 17);\
1699     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703 }\
1704 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705     uint8_t full[24*17];\
1706     uint8_t halfH[272];\
1707     uint8_t halfHV[256];\
1708     copy_block17(full, src, 24, stride, 17);\
1709     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713 }\
1714 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715     uint8_t full[24*17];\
1716     uint8_t halfH[272];\
1717     uint8_t halfV[256];\
1718     uint8_t halfHV[256];\
1719     copy_block17(full, src, 24, stride, 17);\
1720     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1721     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724 }\
1725 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726     uint8_t full[24*17];\
1727     uint8_t halfH[272];\
1728     uint8_t halfHV[256];\
1729     copy_block17(full, src, 24, stride, 17);\
1730     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734 }\
1735 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736     uint8_t halfH[272];\
1737     uint8_t halfHV[256];\
1738     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741 }\
1742 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743     uint8_t halfH[272];\
1744     uint8_t halfHV[256];\
1745     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748 }\
1749 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750     uint8_t full[24*17];\
1751     uint8_t halfH[272];\
1752     uint8_t halfV[256];\
1753     uint8_t halfHV[256];\
1754     copy_block17(full, src, 24, stride, 17);\
1755     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759 }\
1760 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761     uint8_t full[24*17];\
1762     uint8_t halfH[272];\
1763     copy_block17(full, src, 24, stride, 17);\
1764     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767 }\
1768 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769     uint8_t full[24*17];\
1770     uint8_t halfH[272];\
1771     uint8_t halfV[256];\
1772     uint8_t halfHV[256];\
1773     copy_block17(full, src, 24, stride, 17);\
1774     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778 }\
1779 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[24*17];\
1781     uint8_t halfH[272];\
1782     copy_block17(full, src, 24, stride, 17);\
1783     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786 }\
1787 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788     uint8_t halfH[272];\
1789     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791 }
1792
/*
 * Per-pixel store operations handed to the QPEL_MC instantiations below.
 * Each one adds a bias to the value `b`, shifts it right by 5 and looks the
 * result up in `cm`; `cm` is not defined here and must be in scope wherever
 * the operation is expanded.
 *   op_put         - store the lookup for (b + 16) >> 5 into `a`
 *   op_put_no_rnd  - same, with a bias of 15 instead of 16
 *   op_avg         - average that lookup (bias 16) with the value already in
 *                    `a`, adding 1 before halving
 *   op_avg_no_rnd  - average the bias-15 lookup with `a` without the +1;
 *                    no active QPEL_MC line below uses it (the only
 *                    avg_no_rnd instantiation is commented out)
 * QPEL_MC is expanded three times (put_, put_no_rnd_ and avg_ prefixes) and
 * the four operations are undefined again afterwards.
 */
1793 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1794 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1795 #define op_put(a, b) a = cm[((b) + 16)>>5]
1796 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1797
1798 QPEL_MC(0, put_       , _       , op_put)
1799 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1800 QPEL_MC(0, avg_       , _       , op_avg)
1801 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1802 #undef op_avg
1803 #undef op_avg_no_rnd
1804 #undef op_put
1805 #undef op_put_no_rnd
1806
1807 #if 1
/**
 * Generate the lowpass filter helpers for 4x4, 8x8 and 16x16 blocks.
 *
 * Every output sample is the 6-tap sum
 *     (p0 + p1)*20 - (p-1 + p2)*5 + (p-2 + p3)
 * taken along a row (h), along a column (v), or first along rows and then
 * along columns (hv).
 *
 * @param OPNAME prefix pasted onto every generated function name
 * @param OP     store operation applied to each single-pass sum
 *               (h_lowpass and v_lowpass)
 * @param OP2    store operation applied to each two-pass sum (hv_lowpass)
 *
 * Generated for N = 4 and N = 8:
 *  - OPNAME h264_qpelN_h_lowpass: filters N rows; each row reads
 *    src[-2] .. src[N+2].
 *  - OPNAME h264_qpelN_v_lowpass: filters N columns; each column reads
 *    rows -2 .. N+2 relative to src.
 *  - OPNAME h264_qpelN_hv_lowpass: moves src up two rows, writes the raw
 *    horizontal sums of N+5 rows into tmp, rewinds tmp so that it points at
 *    the third row written (tmp[-2*tmpStride] is then the first one), and
 *    runs the same 6-tap sum down each of the N columns of tmp, storing
 *    through OP2.
 *
 * The 16x16 functions call the matching 8x8 function four times: left and
 * right halves (offset 8), then the same again after advancing the pointers
 * by 8 rows (tmp included in the hv case).
 *
 * OP and OP2 are expanded where `cm` is a local pointing at
 * cropTbl + MAX_NEG_CROP.
 */
1808 #define H264_LOWPASS(OPNAME, OP, OP2) \
1809 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1810     const int h=4;\
1811     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1812     int i;\
1813     for(i=0; i<h; i++)\
1814     {\
1815         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1816         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1817         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1818         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1819         dst+=dstStride;\
1820         src+=srcStride;\
1821     }\
1822 }\
1823 \
1824 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1825     const int w=4;\
1826     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1827     int i;\
1828     for(i=0; i<w; i++)\
1829     {\
1830         const int srcB= src[-2*srcStride];\
1831         const int srcA= src[-1*srcStride];\
1832         const int src0= src[0 *srcStride];\
1833         const int src1= src[1 *srcStride];\
1834         const int src2= src[2 *srcStride];\
1835         const int src3= src[3 *srcStride];\
1836         const int src4= src[4 *srcStride];\
1837         const int src5= src[5 *srcStride];\
1838         const int src6= src[6 *srcStride];\
1839         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1840         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1841         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1842         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1843         dst++;\
1844         src++;\
1845     }\
1846 }\
1847 \
1848 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1849     const int h=4;\
1850     const int w=4;\
1851     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1852     int i;\
1853     src -= 2*srcStride;\
1854     for(i=0; i<h+5; i++)\
1855     {\
1856         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1857         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1858         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1859         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1860         tmp+=tmpStride;\
1861         src+=srcStride;\
1862     }\
1863     tmp -= tmpStride*(h+5-2);\
1864     for(i=0; i<w; i++)\
1865     {\
1866         const int tmpB= tmp[-2*tmpStride];\
1867         const int tmpA= tmp[-1*tmpStride];\
1868         const int tmp0= tmp[0 *tmpStride];\
1869         const int tmp1= tmp[1 *tmpStride];\
1870         const int tmp2= tmp[2 *tmpStride];\
1871         const int tmp3= tmp[3 *tmpStride];\
1872         const int tmp4= tmp[4 *tmpStride];\
1873         const int tmp5= tmp[5 *tmpStride];\
1874         const int tmp6= tmp[6 *tmpStride];\
1875         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1876         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1877         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1878         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1879         dst++;\
1880         tmp++;\
1881     }\
1882 }\
1883 \
1884 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1885     const int h=8;\
1886     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1887     int i;\
1888     for(i=0; i<h; i++)\
1889     {\
1890         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1891         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1892         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1893         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1894         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1895         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1896         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1897         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1898         dst+=dstStride;\
1899         src+=srcStride;\
1900     }\
1901 }\
1902 \
1903 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1904     const int w=8;\
1905     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1906     int i;\
1907     for(i=0; i<w; i++)\
1908     {\
1909         const int srcB= src[-2*srcStride];\
1910         const int srcA= src[-1*srcStride];\
1911         const int src0= src[0 *srcStride];\
1912         const int src1= src[1 *srcStride];\
1913         const int src2= src[2 *srcStride];\
1914         const int src3= src[3 *srcStride];\
1915         const int src4= src[4 *srcStride];\
1916         const int src5= src[5 *srcStride];\
1917         const int src6= src[6 *srcStride];\
1918         const int src7= src[7 *srcStride];\
1919         const int src8= src[8 *srcStride];\
1920         const int src9= src[9 *srcStride];\
1921         const int src10=src[10*srcStride];\
1922         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1923         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1924         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1925         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1926         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1927         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1928         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1929         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1930         dst++;\
1931         src++;\
1932     }\
1933 }\
1934 \
1935 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1936     const int h=8;\
1937     const int w=8;\
1938     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1939     int i;\
1940     src -= 2*srcStride;\
1941     for(i=0; i<h+5; i++)\
1942     {\
1943         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1944         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1945         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1946         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1947         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1948         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1949         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1950         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1951         tmp+=tmpStride;\
1952         src+=srcStride;\
1953     }\
1954     tmp -= tmpStride*(h+5-2);\
1955     for(i=0; i<w; i++)\
1956     {\
1957         const int tmpB= tmp[-2*tmpStride];\
1958         const int tmpA= tmp[-1*tmpStride];\
1959         const int tmp0= tmp[0 *tmpStride];\
1960         const int tmp1= tmp[1 *tmpStride];\
1961         const int tmp2= tmp[2 *tmpStride];\
1962         const int tmp3= tmp[3 *tmpStride];\
1963         const int tmp4= tmp[4 *tmpStride];\
1964         const int tmp5= tmp[5 *tmpStride];\
1965         const int tmp6= tmp[6 *tmpStride];\
1966         const int tmp7= tmp[7 *tmpStride];\
1967         const int tmp8= tmp[8 *tmpStride];\
1968         const int tmp9= tmp[9 *tmpStride];\
1969         const int tmp10=tmp[10*tmpStride];\
1970         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1971         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1972         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1973         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1974         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1975         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1976         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1977         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1978         dst++;\
1979         tmp++;\
1980     }\
1981 }\
1982 \
1983 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1984     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1985     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1986     src += 8*srcStride;\
1987     dst += 8*dstStride;\
1988     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1989     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1990 }\
1991 \
1992 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1993     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1994     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1995     src += 8*srcStride;\
1996     dst += 8*dstStride;\
1997     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1998     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1999 }\
2000 \
2001 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2002     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2003     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2004     src += 8*srcStride;\
2005     tmp += 8*tmpStride;\
2006     dst += 8*dstStride;\
2007     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2008     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2009 }\
2010
/**
 * Generate the sixteen OPNAME h264_qpelSIZE_mcXY_c functions for one block
 * size.  Each has the signature (uint8_t *dst, uint8_t *src, int stride).
 *
 * @param OPNAME prefix of the generated functions and of the helper that
 *               performs the final write into dst
 * @param SIZE   block size token pasted into the function and helper names
 *               and used for the temporary buffer dimensions
 *
 * What each generated function does:
 *  - mc00 forwards to OPNAME pixelsSIZE_c.
 *  - mc20, mc02 and mc22 write the OPNAME h, v and hv lowpass result
 *    straight into dst.
 *  - Every other position builds two inputs and hands them to
 *    OPNAME pixelsSIZE_l2 (defined elsewhere; presumably it averages its two
 *    sources - confirm against its definition):
 *      mc10 / mc30: src (src+1 for mc30) and the put_ h lowpass of src
 *      mc01 / mc03: full_mid (full_mid+SIZE for mc03) and the put_ v lowpass
 *      mc11 / mc31 / mc13 / mc33: put_ h lowpass of src (of src + stride for
 *          mc13 and mc33) and put_ v lowpass of the copied block (copied
 *          from one column to the right for mc31 and mc33)
 *      mc21 / mc23: put_ h lowpass of src (of src + stride for mc23) and
 *          put_ hv lowpass
 *      mc12 / mc32: put_ v lowpass of the copied block (copied from one
 *          column to the right for mc32) and put_ hv lowpass
 *
 * Vertical filtering never reads src directly: copy_blockSIZE is called with
 * src - stride*2, destination stride SIZE and SIZE + 5 rows to fill `full`,
 * and full_mid = full + SIZE*2 is the row of that copy corresponding to src.
 * The hv lowpass uses an int16_t scratch buffer `tmp` of SIZE*(SIZE+5)
 * entries with stride SIZE.
 */
2011 #define H264_MC(OPNAME, SIZE) \
2012 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2013     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2014 }\
2015 \
2016 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2017     uint8_t half[SIZE*SIZE];\
2018     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2019     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2020 }\
2021 \
2022 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2023     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2024 }\
2025 \
2026 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2027     uint8_t half[SIZE*SIZE];\
2028     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2029     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2030 }\
2031 \
2032 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2033     uint8_t full[SIZE*(SIZE+5)];\
2034     uint8_t * const full_mid= full + SIZE*2;\
2035     uint8_t half[SIZE*SIZE];\
2036     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2037     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2038     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2039 }\
2040 \
2041 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t full[SIZE*(SIZE+5)];\
2043     uint8_t * const full_mid= full + SIZE*2;\
2044     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2045     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2046 }\
2047 \
2048 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2049     uint8_t full[SIZE*(SIZE+5)];\
2050     uint8_t * const full_mid= full + SIZE*2;\
2051     uint8_t half[SIZE*SIZE];\
2052     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2053     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2054     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2055 }\
2056 \
2057 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058     uint8_t full[SIZE*(SIZE+5)];\
2059     uint8_t * const full_mid= full + SIZE*2;\
2060     uint8_t halfH[SIZE*SIZE];\
2061     uint8_t halfV[SIZE*SIZE];\
2062     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2063     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2064     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2065     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2066 }\
2067 \
2068 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2069     uint8_t full[SIZE*(SIZE+5)];\
2070     uint8_t * const full_mid= full + SIZE*2;\
2071     uint8_t halfH[SIZE*SIZE];\
2072     uint8_t halfV[SIZE*SIZE];\
2073     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2074     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2075     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2076     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2077 }\
2078 \
2079 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2080     uint8_t full[SIZE*(SIZE+5)];\
2081     uint8_t * const full_mid= full + SIZE*2;\
2082     uint8_t halfH[SIZE*SIZE];\
2083     uint8_t halfV[SIZE*SIZE];\
2084     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2085     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2086     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2087     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2088 }\
2089 \
2090 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2091     uint8_t full[SIZE*(SIZE+5)];\
2092     uint8_t * const full_mid= full + SIZE*2;\
2093     uint8_t halfH[SIZE*SIZE];\
2094     uint8_t halfV[SIZE*SIZE];\
2095     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2096     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2097     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2098     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2099 }\
2100 \
2101 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2102     int16_t tmp[SIZE*(SIZE+5)];\
2103     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2104 }\
2105 \
2106 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2107     int16_t tmp[SIZE*(SIZE+5)];\
2108     uint8_t halfH[SIZE*SIZE];\
2109     uint8_t halfHV[SIZE*SIZE];\
2110     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2111     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2112     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2113 }\
2114 \
2115 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2116     int16_t tmp[SIZE*(SIZE+5)];\
2117     uint8_t halfH[SIZE*SIZE];\
2118     uint8_t halfHV[SIZE*SIZE];\
2119     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2120     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2121     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2122 }\
2123 \
2124 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2125     uint8_t full[SIZE*(SIZE+5)];\
2126     uint8_t * const full_mid= full + SIZE*2;\
2127     int16_t tmp[SIZE*(SIZE+5)];\
2128     uint8_t halfV[SIZE*SIZE];\
2129     uint8_t halfHV[SIZE*SIZE];\
2130     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2131     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2132     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2133     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2134 }\
2135 \
2136 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2137     uint8_t full[SIZE*(SIZE+5)];\
2138     uint8_t * const full_mid= full + SIZE*2;\
2139     int16_t tmp[SIZE*(SIZE+5)];\
2140     uint8_t halfV[SIZE*SIZE];\
2141     uint8_t halfHV[SIZE*SIZE];\
2142     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2143     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2144     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2145     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2146 }\
2147
/* Store operators handed to H264_LOWPASS below.  `a` is the destination
 * lvalue, `b` the unscaled filter sum.  Every operator indexes `cm`, so a
 * table with that name must be in scope wherever the macro is expanded.
 *   op_*  : rounds with +16 and shifts right by 5.
 *   op2_* : rounds with +512 and shifts right by 10 (presumably used for the
 *           combined h+v pass -- confirm against H264_LOWPASS).
 *   *_put : overwrites the destination.
 *   *_avg : replaces the destination by (destination + new value + 1) >> 1. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the low-pass filters for both store flavours, then the
 * quarter-pel motion compensation functions for block sizes 4, 8 and 16. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store operators are only meaningful for the instantiations above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2167 #endif
2168
2169 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2170     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2171     int i;
2172
2173     for(i=0; i<h; i++){
2174         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2175         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2176         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2177         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2178         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2179         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2180         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2181         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2182         dst+=dstStride;
2183         src+=srcStride;
2184     }
2185 }
2186
2187 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2188     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2189     int i;
2190
2191     for(i=0; i<w; i++){
2192         const int src_1= src[ -srcStride];
2193         const int src0 = src[0          ];
2194         const int src1 = src[  srcStride];
2195         const int src2 = src[2*srcStride];
2196         const int src3 = src[3*srcStride];
2197         const int src4 = src[4*srcStride];
2198         const int src5 = src[5*srcStride];
2199         const int src6 = src[6*srcStride];
2200         const int src7 = src[7*srcStride];
2201         const int src8 = src[8*srcStride];
2202         const int src9 = src[9*srcStride];
2203         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2204         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2205         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2206         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2207         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2208         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2209         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2210         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2211         src++;
2212         dst++;
2213     }
2214 }
2215
/* mspel position 00 (presumably the full-pel case): no filtering here, the
 * call is forwarded to put_pixels8_c with a block height of 8. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2219
/* mspel position 10: the horizontally filtered block is built in a packed
 * 8x8 scratch buffer and then combined with the unfiltered source through
 * put_pixels8_l2 (presumably a rounded average -- defined elsewhere). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2225
/* mspel position 20: 8 rows of the horizontal low-pass filter written
 * straight into dst (source and destination share the same stride). */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2229
/* mspel position 30: same as position 10, except that the horizontally
 * filtered block is combined with the source shifted one pixel to the right
 * (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2235
/* mspel position 02: 8 columns of the vertical low-pass filter written
 * straight into dst (source and destination share the same stride). */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2239
/* mspel position 12: combines the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[11*8]; /* 11 h-filtered rows, starting one row above src */
    uint8_t onlyV[8*8];
    uint8_t bothHV[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(onlyV, src, 8, stride, 8);
    /* rowsH+8 is the row matching src; the vertical taps reach rows -1..9 */
    wmv2_mspel8_v_lowpass(bothHV, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, onlyV, bothHV, stride, 8, 8, 8);
}
/* mspel position 32: like position 12, but the vertical-only filter runs on
 * the source shifted one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[11*8]; /* 11 h-filtered rows, starting one row above src */
    uint8_t onlyV[8*8];
    uint8_t bothHV[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(onlyV, src+1, 8, stride, 8);
    /* rowsH+8 is the row matching src; the vertical taps reach rows -1..9 */
    wmv2_mspel8_v_lowpass(bothHV, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, onlyV, bothHV, stride, 8, 8, 8);
}
/* mspel position 22: horizontal filter into a packed scratch buffer
 * (11 rows of 8, starting one row above src), then vertical filter of that
 * buffer straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[11*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    /* skip the extra top row: rowsH+8 lines up with src */
    wmv2_mspel8_v_lowpass(dst, rowsH+8, stride, 8, 8);
}
2263
2264
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return sum over all 256 positions of |pix1 - pix2|
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2292
/**
 * 16x16 sum of absolute differences where each pix2 sample is first combined
 * with its right-hand neighbour through avg2() (horizontal half-pel
 * position).  Reads pix2[0..16] on every row.
 * @param line_size distance in bytes between rows, used for both blocks
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2320
/**
 * 16x16 sum of absolute differences where each pix2 sample is first combined
 * with the sample directly below it through avg2() (vertical half-pel
 * position).  Reads 17 rows of pix2.
 * @param line_size distance in bytes between rows, used for both blocks
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2350
/**
 * 16x16 sum of absolute differences where each pix2 sample is first combined
 * with its right, lower and lower-right neighbours through avg4() (diagonal
 * half-pel position).  Reads 17 rows of 17 pix2 samples.
 * @param line_size distance in bytes between rows, used for both blocks
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2380
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return sum over all 64 positions of |pix1 - pix2|
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2400
/**
 * 8x8 sum of absolute differences where each pix2 sample is first combined
 * with its right-hand neighbour through avg2() (horizontal half-pel
 * position).  Reads pix2[0..8] on every row.
 * @param line_size distance in bytes between rows, used for both blocks
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2420
/**
 * 8x8 sum of absolute differences where each pix2 sample is first combined
 * with the sample directly below it through avg2() (vertical half-pel
 * position).  Reads 9 rows of pix2.
 * @param line_size distance in bytes between rows, used for both blocks
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2442
/**
 * 8x8 sum of absolute differences where each pix2 sample is first combined
 * with its right, lower and lower-right neighbours through avg4() (diagonal
 * half-pel position).  Reads 9 rows of 9 pix2 samples.
 * @param line_size distance in bytes between rows, used for both blocks
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2464
/* 16x16 SAD with a leading context pointer in the signature; the context `s`
 * is not used, the work is done by pix_abs16x16_c. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2468
/* 8x8 SAD with a leading context pointer in the signature; the context `s`
 * is not used, the work is done by pix_abs8x8_c. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2472
2473 /**
2474  * permutes an 8x8 block.
2475  * @param block the block which will be permuted according to the given permutation vector
2476  * @param permutation the permutation vector
2477  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2478  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2479  *                  (inverse) permutated to scantable order!
2480  */
2481 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2482 {
2483     int i;
2484     DCTELEM temp[64];
2485
2486     if(last<=0) return;
2487     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2488
2489     for(i=0; i<=last; i++){
2490         const int j= scantable[i];
2491         temp[j]= block[j];
2492         block[j]=0;
2493     }
2494
2495     for(i=0; i<=last; i++){
2496         const int j= scantable[i];
2497         const int perm_j= permutation[j];
2498         block[perm_j]= temp[j];
2499     }
2500 }
2501
2502 /**
2503  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2504  */
2505 static void clear_blocks_c(DCTELEM *blocks)
2506 {
2507     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2508 }
2509
/**
 * Adds src to dst byte by byte (dst[i] += src[i]) for the first w bytes;
 * sums wrap modulo 256 because the elements are uint8_t.
 * Nothing is done when w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] += src[pos];
        pos++;
    }
}
2525
/**
 * Stores the byte-wise difference src1[i] - src2[i] into dst[i] for the
 * first w bytes; results wrap modulo 256 because dst is uint8_t.
 * Nothing is done when w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] = src1[pos] - src2[pos];
        pos++;
    }
}
2541
/**
 * Writes the residual of a median prediction for one row of w samples.
 * For every position the prediction is mid_pred(left, top, (left + top -
 * top_left) & 0xFF), where top is src1[i] and left is the previous src2
 * sample; dst[i] receives src2[i] minus that prediction.
 * @param left      in: left neighbour of the first sample, out: last src2 sample
 * @param left_top  in: top-left neighbour of the first sample, out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top = src1[x];
        const int pred= mid_pred(cur_left, top, (cur_left + top - cur_top_left)&0xFF);

        /* current samples become the neighbours of the next position */
        cur_top_left= top;
        cur_left    = src2[x];
        dst[x]= cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_top_left;
}
2559
/* Butterfly helpers for the Hadamard transforms below.
 * BUTTERFLY2 and BUTTERFLY1 expand to a single statement
 * (do { ... } while (0)), so they stay correct under an unbraced if/else,
 * and must be followed by a semicolon. */

/* o1 = i1 + i2, o2 = i1 - i2.
 * i1 and i2 are each evaluated twice, so they must be free of side effects;
 * o1 is stored before i1/i2 are read again for o2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/* In-place butterfly: (x, y) <- (x + y, x - y).
 * The temporaries carry a prefix so that they cannot capture caller
 * variables that happen to be named a or b. */
#define BUTTERFLY1(x,y) \
do {\
    const int butterfly1_a= (x);\
    const int butterfly1_b= (y);\
    (x)= butterfly1_a + butterfly1_b;\
    (y)= butterfly1_a - butterfly1_b;\
} while (0)

/* |x + y| + |x - y|: final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2574
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 * The transform is done in an int scratch block: a row pass (three butterfly
 * stages per row) followed by a column pass whose last stage is folded into
 * the absolute-value accumulation.
 * @param s      context pointer, not used here
 * @param stride distance in bytes between rows, used for both inputs
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int temp[64];
    int sum=0;
    int n;

    /* row pass */
    for(n=0; n<8; n++){
        int * const t= temp + 8*n;
        const uint8_t * const cur= src + stride*n;
        const uint8_t * const ref= dst + stride*n;

        BUTTERFLY2(t[0], t[1], cur[0]-ref[0], cur[1]-ref[1]);
        BUTTERFLY2(t[2], t[3], cur[2]-ref[2], cur[3]-ref[3]);
        BUTTERFLY2(t[4], t[5], cur[4]-ref[4], cur[5]-ref[5]);
        BUTTERFLY2(t[6], t[7], cur[6]-ref[6], cur[7]-ref[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* column pass, t[8*k] is row k of column n */
    for(n=0; n<8; n++){
        int * const t= temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum +=
             BUTTERFLYA(t[8*0], t[8*4])
            +BUTTERFLYA(t[8*1], t[8*5])
            +BUTTERFLYA(t[8*2], t[8*6])
            +BUTTERFLYA(t[8*3], t[8*7]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2624
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - mean).
 * Same row pass / column pass structure as the difference variant, except
 * that the constant `mean` is subtracted from every source sample.
 * @param stride distance in bytes between source rows
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int temp[64];
    int sum=0;
    int n;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* row pass */
    for(n=0; n<8; n++){
        int * const t= temp + 8*n;
        const uint8_t * const cur= src + stride*n;

        BUTTERFLY2(t[0], t[1], cur[0]-mean, cur[1]-mean);
        BUTTERFLY2(t[2], t[3], cur[2]-mean, cur[3]-mean);
        BUTTERFLY2(t[4], t[5], cur[4]-mean, cur[5]-mean);
        BUTTERFLY2(t[6], t[7], cur[6]-mean, cur[7]-mean);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* column pass, t[8*k] is row k of column n */
    for(n=0; n<8; n++){
        int * const t= temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum +=
             BUTTERFLYA(t[8*0], t[8*4])
            +BUTTERFLYA(t[8*1], t[8*5])
            +BUTTERFLYA(t[8*2], t[8*6])
            +BUTTERFLYA(t[8*3], t[8*7]);
    }

    return sum;
}
2668
2669 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2670     MpegEncContext * const s= (MpegEncContext *)c;
2671     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2672     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2673     int sum=0, i;
2674
2675     s->dsp.diff_pixels(temp, src1, src2, stride);
2676     s->dsp.fdct(temp);
2677
2678     for(i=0; i<64; i++)
2679         sum+= ABS(temp[i]);
2680
2681     return sum;
2682 }
2683
2684 void simple_idct(DCTELEM *block); //FIXME
2685
2686 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2687     MpegEncContext * const s= (MpegEncContext *)c;
2688     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2689     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2690     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2691     int sum=0, i;
2692
2693     s->mb_intra=0;
2694
2695     s->dsp.diff_pixels(temp, src1, src2, stride);
2696
2697     memcpy(bak, temp, 64*sizeof(DCTELEM));
2698
2699     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2700     s->dct_unquantize(s, temp, 0, s->qscale);
2701     simple_idct(temp); //FIXME
2702
2703     for(i=0; i<64; i++)
2704         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2705
2706     return sum;
2707 }
2708
2709 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2710     MpegEncContext * const s= (MpegEncContext *)c;
2711     const uint8_t *scantable= s->intra_scantable.permutated;
2712     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2713     uint64_t __align8 aligned_bak[stride];
2714     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2715     uint8_t * const bak= (uint8_t*)aligned_bak;
2716     int i, last, run, bits, level, distoration, start_i;
2717     const int esc_length= s->ac_esc_length;
2718     uint8_t * length;
2719     uint8_t * last_length;
2720
2721     for(i=0; i<8; i++){
2722         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2723         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2724     }
2725
2726     s->dsp.diff_pixels(temp, src1, src2, stride);
2727
2728     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2729
2730     bits=0;
2731
2732     if (s->mb_intra) {
2733         start_i = 1;
2734         length     = s->intra_ac_vlc_length;
2735         last_length= s->intra_ac_vlc_last_length;
2736         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2737     } else {
2738         start_i = 0;
2739         length     = s->inter_ac_vlc_length;
2740         last_length= s->inter_ac_vlc_last_length;
2741     }
2742
2743     if(last>=start_i){
2744         run=0;
2745         for(i=start_i; i<last; i++){
2746             int j= scantable[i];
2747             level= temp[j];
2748
2749             if(level){
2750                 level+=64;
2751                 if((level&(~127)) == 0){
2752                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2753                 }else
2754                     bits+= esc_length;
2755                 run=0;
2756             }else
2757                 run++;
2758         }
2759         i= scantable[last];
2760
2761         level= temp[i] + 64;
2762
2763         assert(level - 64);
2764
2765         if((level&(~127)) == 0){
2766             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2767         }else
2768             bits+= esc_length;
2769
2770     }
2771
2772     if(last>=0){
2773         s->dct_unquantize(s, temp, 0, s->qscale);
2774     }
2775
2776     s->dsp.idct_add(bak, stride, temp);
2777
2778     distoration= s->dsp.sse[1](NULL, bak, src1, stride);
2779
2780     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2781 }
2782
2783 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2784     MpegEncContext * const s= (MpegEncContext *)c;
2785     const uint8_t *scantable= s->intra_scantable.permutated;
2786     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2787     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2788     int i, last, run, bits, level, start_i;
2789     const int esc_length= s->ac_esc_length;
2790     uint8_t * length;
2791     uint8_t * last_length;
2792
2793     s->dsp.diff_pixels(temp, src1, src2, stride);
2794
2795     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2796
2797     bits=0;
2798
2799     if (s->mb_intra) {
2800         start_i = 1;
2801         length     = s->intra_ac_vlc_length;
2802         last_length= s->intra_ac_vlc_last_length;
2803         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2804     } else {
2805         start_i = 0;
2806         length     = s->inter_ac_vlc_length;
2807         last_length= s->inter_ac_vlc_last_length;
2808     }
2809
2810     if(last>=start_i){
2811         run=0;
2812         for(i=start_i; i<last; i++){
2813             int j= scantable[i];
2814             level= temp[j];
2815
2816             if(level){
2817                 level+=64;
2818                 if((level&(~127)) == 0){
2819                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2820                 }else
2821                     bits+= esc_length;
2822                 run=0;
2823             }else
2824                 run++;
2825         }
2826         i= scantable[last];
2827
2828         level= temp[i] + 64;
2829
2830         assert(level - 64);
2831
2832         if((level&(~127)) == 0){
2833             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2834         }else
2835             bits+= esc_length;
2836     }
2837
2838     return bits;
2839 }
2840
2841
/* Generate the 16x16 comparison functions named in the second argument from
 * the 8x8 ones named in the first.  WARPER88_1616 is defined elsewhere in
 * this file; presumably it applies the 8x8 function to the four quadrants
 * and sums the results -- confirm against the macro. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2847
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* Runs j_rev_dct on the block in place, then stores the result into dest
 * through put_pixels_clamped_c using line_size as the row pitch. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Runs j_rev_dct on the block in place, then adds the result onto dest
 * through add_pixels_clamped_c using line_size as the row pitch. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2860
2861 /* init static data */
2862 void dsputil_static_init(void)
2863 {
2864     int i;
2865
2866     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2867     for(i=0;i<MAX_NEG_CROP;i++) {
2868         cropTbl[i] = 0;
2869         cropTbl[i + MAX_NEG_CROP + 256] = 255;
2870     }
2871
2872     for(i=0;i<512;i++) {
2873         squareTbl[i] = (i - 256) * (i - 256);
2874     }
2875
2876     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2877 }
2878
2879
2880 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2881 {
2882     int i;
2883
2884 #ifdef CONFIG_ENCODERS
2885     if(avctx->dct_algo==FF_DCT_FASTINT) {
2886         c->fdct = fdct_ifast;
2887         c->fdct248 = fdct_ifast248;
2888     }
2889     else if(avctx->dct_algo==FF_DCT_FAAN) {
2890         c->fdct = ff_faandct;
2891         c->fdct248 = ff_faandct248;
2892     }
2893     else {
2894         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2895         c->fdct248 = ff_fdct248_islow;
2896     }
2897 #endif //CONFIG_ENCODERS
2898
2899     if(avctx->idct_algo==FF_IDCT_INT){
2900         c->idct_put= ff_jref_idct_put;
2901         c->idct_add= ff_jref_idct_add;
2902         c->idct    = j_rev_dct;
2903         c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2904     }else{ //accurate/default
2905         c->idct_put= simple_idct_put;
2906         c->idct_add= simple_idct_add;
2907         c->idct    = simple_idct;
2908         c->idct_permutation_type= FF_NO_IDCT_PERM;
2909     }
2910
2911     c->get_pixels = get_pixels_c;
2912     c->diff_pixels = diff_pixels_c;
2913     c->put_pixels_clamped = put_pixels_clamped_c;
2914     c->add_pixels_clamped = add_pixels_clamped_c;
2915     c->gmc1 = gmc1_c;
2916     c->gmc = gmc_c;
2917     c->clear_blocks = clear_blocks_c;
2918     c->pix_sum = pix_sum_c;
2919     c->pix_norm1 = pix_norm1_c;
2920     c->sse[0]= sse16_c;
2921     c->sse[1]= sse8_c;
2922
2923     /* TODO [0] 16  [1] 8 */
2924     c->pix_abs16x16     = pix_abs16x16_c;
2925     c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
2926     c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
2927     c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2928     c->pix_abs8x8     = pix_abs8x8_c;
2929     c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
2930     c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
2931     c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2932
2933 #define dspfunc(PFX, IDX, NUM) \
2934     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
2935     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
2936     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
2937     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2938
2939     dspfunc(put, 0, 16);
2940     dspfunc(put_no_rnd, 0, 16);
2941     dspfunc(put, 1, 8);
2942     dspfunc(put_no_rnd, 1, 8);
2943     dspfunc(put, 2, 4);
2944     dspfunc(put, 3, 2);
2945
2946     dspfunc(avg, 0, 16);
2947     dspfunc(avg_no_rnd, 0, 16);
2948     dspfunc(avg, 1, 8);
2949     dspfunc(avg_no_rnd, 1, 8);
2950     dspfunc(avg, 2, 4);
2951     dspfunc(avg, 3, 2);
2952 #undef dspfunc
2953
2954     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2955     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2956     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2957     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2958     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2959     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2960     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2961     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2962     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2963
2964     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2965     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2966     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2967     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2968     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2969     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2970     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2971     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2972     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2973
2974 #define dspfunc(PFX, IDX, NUM) \
2975     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2976     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2977     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2978     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2979     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2980     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2981     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2982     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2983     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2984     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2985     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2986     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2987     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2988     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2989     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2990     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2991
2992     dspfunc(put_qpel, 0, 16);
2993     dspfunc(put_no_rnd_qpel, 0, 16);
2994
2995     dspfunc(avg_qpel, 0, 16);
2996     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2997
2998     dspfunc(put_qpel, 1, 8);
2999     dspfunc(put_no_rnd_qpel, 1, 8);
3000
3001     dspfunc(avg_qpel, 1, 8);
3002     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3003
3004     dspfunc(put_h264_qpel, 0, 16);
3005     dspfunc(put_h264_qpel, 1, 8);
3006     dspfunc(put_h264_qpel, 2, 4);
3007     dspfunc(avg_h264_qpel, 0, 16);
3008     dspfunc(avg_h264_qpel, 1, 8);
3009     dspfunc(avg_h264_qpel, 2, 4);
3010
3011 #undef dspfunc
3012     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3013     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3014     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3015     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3016     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3017     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3018
3019     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3020     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3021     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3022     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3023     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3024     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3025     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3026     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3027
3028     c->hadamard8_diff[0]= hadamard8_diff16_c;
3029     c->hadamard8_diff[1]= hadamard8_diff_c;
3030     c->hadamard8_abs = hadamard8_abs_c;
3031
3032     c->dct_sad[0]= dct_sad16x16_c;
3033     c->dct_sad[1]= dct_sad8x8_c;
3034
3035     c->sad[0]= sad16x16_c;
3036     c->sad[1]= sad8x8_c;
3037
3038     c->quant_psnr[0]= quant_psnr16x16_c;
3039     c->quant_psnr[1]= quant_psnr8x8_c;
3040
3041     c->rd[0]= rd16x16_c;
3042     c->rd[1]= rd8x8_c;
3043
3044     c->bit[0]= bit16x16_c;
3045     c->bit[1]= bit8x8_c;
3046
3047     c->add_bytes= add_bytes_c;
3048     c->diff_bytes= diff_bytes_c;
3049     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3050     c->bswap_buf= bswap_buf;
3051
3052 #ifdef HAVE_MMX
3053     dsputil_init_mmx(c, avctx);
3054 #endif
3055 #ifdef ARCH_ARMV4L
3056     dsputil_init_armv4l(c, avctx);
3057 #endif
3058 #ifdef HAVE_MLIB
3059     dsputil_init_mlib(c, avctx);
3060 #endif
3061 #ifdef ARCH_ALPHA
3062     dsputil_init_alpha(c, avctx);
3063 #endif
3064 #ifdef ARCH_POWERPC
3065     dsputil_init_ppc(c, avctx);
3066 #endif
3067 #ifdef HAVE_MMI
3068     dsputil_init_mmi(c, avctx);
3069 #endif
3070 #ifdef ARCH_SH4
3071     dsputil_init_sh4(c,avctx);
3072 #endif
3073
3074     switch(c->idct_permutation_type){
3075     case FF_NO_IDCT_PERM:
3076         for(i=0; i<64; i++)
3077             c->idct_permutation[i]= i;
3078         break;
3079     case FF_LIBMPEG2_IDCT_PERM:
3080         for(i=0; i<64; i++)
3081             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3082         break;
3083     case FF_SIMPLE_IDCT_PERM:
3084         for(i=0; i<64; i++)
3085             c->idct_permutation[i]= simple_mmx_permutation[i];
3086         break;
3087     case FF_TRANSPOSE_IDCT_PERM:
3088         for(i=0; i<64; i++)
3089             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3090         break;
3091     default:
3092         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3093     }
3094 }
3095