git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
/*
 * Lookup table read through `cropTbl + MAX_NEG_CROP` by put_pixels_clamped_c()
 * and add_pixels_clamped_c(), which index it with signed values and store the
 * result as an 8-bit pixel.
 * NOTE(review): presumably saturates its index to 0..255; the code that fills
 * the table is not in this part of the file -- confirm.
 */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/*
 * Lookup table read through `squareTbl + 256` by pix_norm1_c(), sse8_c() and
 * sse16_c(), indexed with pixel values or pixel differences (-255..255).
 * NOTE(review): presumably holds the square of the index; the code that fills
 * the table is not in this part of the file -- confirm.
 */
uint32_t squareTbl[512];
  36
/*
 * Zigzag scan order for an 8x8 block.  Entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order: read that way the
 * values walk the anti-diagonals, starting at 0, 1, 8, 16, 9, 2, ...
 */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  47
/*
 * Zigzag scan specific to the 2-4-8 IDCT.
 * NOTE: unlike the specification, the two fields are interleaved here, which
 * is why consecutive entries frequently differ by 8 (one row of the block).
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  60
/*
 * Inverse of ff_zigzag_direct (not permuted), with 1 added to every entry,
 * for use by the MMX quantizer.  8-byte aligned via __align8.
 * The table is only declared here; the code that fills it is not in this part
 * of the file.
 */
uint16_t __align8 inv_zigzag_direct16[64];
  63
/*
 * Alternate scan order for an 8x8 block, horizontal variant.  Entries are
 * raster positions (row * 8 + column); the scan begins by running along the
 * first row (0, 1, 2, 3) before it drops to the rows below.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  74
/*
 * Alternate scan order for an 8x8 block, vertical variant.  Entries are
 * raster positions (row * 8 + column); the scan begins by running down the
 * first column (0, 8, 16, 24) before it moves to the columns on the right.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  85
/*
 * Fixed-point reciprocal table that turns a division into a multiply + shift:
 *
 *     (a * inverse[b]) >> 32 == a / b    for all 0 <= a <= 65536, 2 <= b <= 255
 *
 * The product does not fit in 32 bits (e.g. 65536 * inverse[2] == 2^47), so it
 * has to be formed in a 64-bit type before shifting.
 *
 * Spot checks (b = 2, 3, 5, 7, 255) match 2^32 / b rounded up.
 * Indices 0 and 1 are outside the guaranteed range: inverse[0] is 0, and
 * inverse[1] is 2^32 - 1, which gives a - 1 instead of a for a >= 1.
 */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 121
/*
 * Input permutation for simple_idct_mmx: a 64-entry table of coefficient
 * positions (0x00..0x3F) giving the order in which that IDCT expects its
 * input.  File-local; the code that consumes it is not in this part of the
 * file.
 */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 133
 134 static int pix_sum_c(uint8_t * pix, int line_size)
 135 {
 136     int s, i, j;
 137
 138     s = 0;
 139     for (i = 0; i < 16; i++) {
 140         for (j = 0; j < 16; j += 8) {
 141             s += pix[0];
 142             s += pix[1];
 143             s += pix[2];
 144             s += pix[3];
 145             s += pix[4];
 146             s += pix[5];
 147             s += pix[6];
 148             s += pix[7];
 149             pix += 8;
 150         }
 151         pix += line_size - 16;
 152     }
 153     return s;
 154 }
 155
 156 static int pix_norm1_c(uint8_t * pix, int line_size)
 157 {
 158     int s, i, j;
 159     uint32_t *sq = squareTbl + 256;
 160
 161     s = 0;
 162     for (i = 0; i < 16; i++) {
 163         for (j = 0; j < 16; j += 8) {
 164 #if 0
 165             s += sq[pix[0]];
 166             s += sq[pix[1]];
 167             s += sq[pix[2]];
 168             s += sq[pix[3]];
 169             s += sq[pix[4]];
 170             s += sq[pix[5]];
 171             s += sq[pix[6]];
 172             s += sq[pix[7]];
 173 #else
 174 #if LONG_MAX > 2147483647
 175             register uint64_t x=*(uint64_t*)pix;
 176             s += sq[x&0xff];
 177             s += sq[(x>>8)&0xff];
 178             s += sq[(x>>16)&0xff];
 179             s += sq[(x>>24)&0xff];
 180             s += sq[(x>>32)&0xff];
 181             s += sq[(x>>40)&0xff];
 182             s += sq[(x>>48)&0xff];
 183             s += sq[(x>>56)&0xff];
 184 #else
 185             register uint32_t x=*(uint32_t*)pix;
 186             s += sq[x&0xff];
 187             s += sq[(x>>8)&0xff];
 188             s += sq[(x>>16)&0xff];
 189             s += sq[(x>>24)&0xff];
 190             x=*(uint32_t*)(pix+4);
 191             s += sq[x&0xff];
 192             s += sq[(x>>8)&0xff];
 193             s += sq[(x>>16)&0xff];
 194             s += sq[(x>>24)&0xff];
 195 #endif
 196 #endif
 197             pix += 8;
 198         }
 199         pix += line_size - 16;
 200     }
 201     return s;
 202 }
 203
 204 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 205     int i;
 206
 207     for(i=0; i+8<=w; i+=8){
 208         dst[i+0]= bswap_32(src[i+0]);
 209         dst[i+1]= bswap_32(src[i+1]);
 210         dst[i+2]= bswap_32(src[i+2]);
 211         dst[i+3]= bswap_32(src[i+3]);
 212         dst[i+4]= bswap_32(src[i+4]);
 213         dst[i+5]= bswap_32(src[i+5]);
 214         dst[i+6]= bswap_32(src[i+6]);
 215         dst[i+7]= bswap_32(src[i+7]);
 216     }
 217     for(;i<w; i++){
 218         dst[i+0]= bswap_32(src[i+0]);
 219     }
 220 }
 221
 222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 223 {
 224     int s, i;
 225     uint32_t *sq = squareTbl + 256;
 226
 227     s = 0;
 228     for (i = 0; i < h; i++) {
 229         s += sq[pix1[0] - pix2[0]];
 230         s += sq[pix1[1] - pix2[1]];
 231         s += sq[pix1[2] - pix2[2]];
 232         s += sq[pix1[3] - pix2[3]];
 233         s += sq[pix1[4] - pix2[4]];
 234         s += sq[pix1[5] - pix2[5]];
 235         s += sq[pix1[6] - pix2[6]];
 236         s += sq[pix1[7] - pix2[7]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[ 0] - pix2[ 0]];
 251         s += sq[pix1[ 1] - pix2[ 1]];
 252         s += sq[pix1[ 2] - pix2[ 2]];
 253         s += sq[pix1[ 3] - pix2[ 3]];
 254         s += sq[pix1[ 4] - pix2[ 4]];
 255         s += sq[pix1[ 5] - pix2[ 5]];
 256         s += sq[pix1[ 6] - pix2[ 6]];
 257         s += sq[pix1[ 7] - pix2[ 7]];
 258         s += sq[pix1[ 8] - pix2[ 8]];
 259         s += sq[pix1[ 9] - pix2[ 9]];
 260         s += sq[pix1[10] - pix2[10]];
 261         s += sq[pix1[11] - pix2[11]];
 262         s += sq[pix1[12] - pix2[12]];
 263         s += sq[pix1[13] - pix2[13]];
 264         s += sq[pix1[14] - pix2[14]];
 265         s += sq[pix1[15] - pix2[15]];
 266
 267         pix1 += line_size;
 268         pix2 += line_size;
 269     }
 270     return s;
 271 }
 272
 273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 274 {
 275     int i;
 276
 277     /* read the pixels */
 278     for(i=0;i<8;i++) {
 279         block[0] = pixels[0];
 280         block[1] = pixels[1];
 281         block[2] = pixels[2];
 282         block[3] = pixels[3];
 283         block[4] = pixels[4];
 284         block[5] = pixels[5];
 285         block[6] = pixels[6];
 286         block[7] = pixels[7];
 287         pixels += line_size;
 288         block += 8;
 289     }
 290 }
 291
 292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 293                           const uint8_t *s2, int stride){
 294     int i;
 295
 296     /* read the pixels */
 297     for(i=0;i<8;i++) {
 298         block[0] = s1[0] - s2[0];
 299         block[1] = s1[1] - s2[1];
 300         block[2] = s1[2] - s2[2];
 301         block[3] = s1[3] - s2[3];
 302         block[4] = s1[4] - s2[4];
 303         block[5] = s1[5] - s2[5];
 304         block[6] = s1[6] - s2[6];
 305         block[7] = s1[7] - s2[7];
 306         s1 += stride;
 307         s2 += stride;
 308         block += 8;
 309     }
 310 }
 311
 312
 313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 314                                  int line_size)
 315 {
 316     int i;
 317     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 318
 319     /* read the pixels */
 320     for(i=0;i<8;i++) {
 321         pixels[0] = cm[block[0]];
 322         pixels[1] = cm[block[1]];
 323         pixels[2] = cm[block[2]];
 324         pixels[3] = cm[block[3]];
 325         pixels[4] = cm[block[4]];
 326         pixels[5] = cm[block[5]];
 327         pixels[6] = cm[block[6]];
 328         pixels[7] = cm[block[7]];
 329
 330         pixels += line_size;
 331         block += 8;
 332     }
 333 }
 334
 335 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 336                                         uint8_t *restrict pixels,
 337                                         int line_size)
 338 {
 339     int i, j;
 340
 341     for (i = 0; i < 8; i++) {
 342         for (j = 0; j < 8; j++) {
 343             if (*block < -128)
 344                 *pixels = 0;
 345             else if (*block > 127)
 346                 *pixels = 255;
 347             else
 348                 *pixels = (uint8_t)(*block + 128);
 349             block++;
 350             pixels++;
 351         }
 352         pixels += (line_size - 8);
 353     }
 354 }
 355
 356 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 357                           int line_size)
 358 {
 359     int i;
 360     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 361
 362     /* read the pixels */
 363     for(i=0;i<8;i++) {
 364         pixels[0] = cm[pixels[0] + block[0]];
 365         pixels[1] = cm[pixels[1] + block[1]];
 366         pixels[2] = cm[pixels[2] + block[2]];
 367         pixels[3] = cm[pixels[3] + block[3]];
 368         pixels[4] = cm[pixels[4] + block[4]];
 369         pixels[5] = cm[pixels[5] + block[5]];
 370         pixels[6] = cm[pixels[6] + block[6]];
 371         pixels[7] = cm[pixels[7] + block[7]];
 372         pixels += line_size;
 373         block += 8;
 374     }
 375 }
 376 #if 0
 377
 378 #define PIXOP2(OPNAME, OP) \
 379 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 380 {\
 381     int i;\
 382     for(i=0; i<h; i++){\
 383         OP(*((uint64_t*)block), LD64(pixels));\
 384         pixels+=line_size;\
 385         block +=line_size;\
 386     }\
 387 }\
 388 \
 389 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 390 {\
 391     int i;\
 392     for(i=0; i<h; i++){\
 393         const uint64_t a= LD64(pixels  );\
 394         const uint64_t b= LD64(pixels+1);\
 395         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 396         pixels+=line_size;\
 397         block +=line_size;\
 398     }\
 399 }\
 400 \
 401 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 402 {\
 403     int i;\
 404     for(i=0; i<h; i++){\
 405         const uint64_t a= LD64(pixels  );\
 406         const uint64_t b= LD64(pixels+1);\
 407         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 408         pixels+=line_size;\
 409         block +=line_size;\
 410     }\
 411 }\
 412 \
 413 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 414 {\
 415     int i;\
 416     for(i=0; i<h; i++){\
 417         const uint64_t a= LD64(pixels          );\
 418         const uint64_t b= LD64(pixels+line_size);\
 419         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 420         pixels+=line_size;\
 421         block +=line_size;\
 422     }\
 423 }\
 424 \
 425 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 426 {\
 427     int i;\
 428     for(i=0; i<h; i++){\
 429         const uint64_t a= LD64(pixels          );\
 430         const uint64_t b= LD64(pixels+line_size);\
 431         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 432         pixels+=line_size;\
 433         block +=line_size;\
 434     }\
 435 }\
 436 \
 437 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 438 {\
 439         int i;\
 440         const uint64_t a= LD64(pixels  );\
 441         const uint64_t b= LD64(pixels+1);\
 442         uint64_t l0=  (a&0x0303030303030303ULL)\
 443                     + (b&0x0303030303030303ULL)\
 444                     + 0x0202020202020202ULL;\
 445         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 446                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 447         uint64_t l1,h1;\
 448 \
 449         pixels+=line_size;\
 450         for(i=0; i<h; i+=2){\
 451             uint64_t a= LD64(pixels  );\
 452             uint64_t b= LD64(pixels+1);\
 453             l1=  (a&0x0303030303030303ULL)\
 454                + (b&0x0303030303030303ULL);\
 455             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 456               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 457             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 458             pixels+=line_size;\
 459             block +=line_size;\
 460             a= LD64(pixels  );\
 461             b= LD64(pixels+1);\
 462             l0=  (a&0x0303030303030303ULL)\
 463                + (b&0x0303030303030303ULL)\
 464                + 0x0202020202020202ULL;\
 465             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 466               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 467             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 468             pixels+=line_size;\
 469             block +=line_size;\
 470         }\
 471 }\
 472 \
 473 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 474 {\
 475         int i;\
 476         const uint64_t a= LD64(pixels  );\
 477         const uint64_t b= LD64(pixels+1);\
 478         uint64_t l0=  (a&0x0303030303030303ULL)\
 479                     + (b&0x0303030303030303ULL)\
 480                     + 0x0101010101010101ULL;\
 481         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 482                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 483         uint64_t l1,h1;\
 484 \
 485         pixels+=line_size;\
 486         for(i=0; i<h; i+=2){\
 487             uint64_t a= LD64(pixels  );\
 488             uint64_t b= LD64(pixels+1);\
 489             l1=  (a&0x0303030303030303ULL)\
 490                + (b&0x0303030303030303ULL);\
 491             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 492               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 493             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 494             pixels+=line_size;\
 495             block +=line_size;\
 496             a= LD64(pixels  );\
 497             b= LD64(pixels+1);\
 498             l0=  (a&0x0303030303030303ULL)\
 499                + (b&0x0303030303030303ULL)\
 500                + 0x0101010101010101ULL;\
 501             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 502               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 503             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 504             pixels+=line_size;\
 505             block +=line_size;\
 506         }\
 507 }\
 508 \
 509 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 510 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 511 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 512 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 513 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 514 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 515 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 516
 517 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 518 #else // 64 bit variant
 519
 520 #define PIXOP2(OPNAME, OP) \
 521 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 522     int i;\
 523     for(i=0; i<h; i++){\
 524         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 525         pixels+=line_size;\
 526         block +=line_size;\
 527     }\
 528 }\
 529 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 530     int i;\
 531     for(i=0; i<h; i++){\
 532         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 533         pixels+=line_size;\
 534         block +=line_size;\
 535     }\
 536 }\
 537 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 538     int i;\
 539     for(i=0; i<h; i++){\
 540         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 541         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 542         pixels+=line_size;\
 543         block +=line_size;\
 544     }\
 545 }\
 546 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 547     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 548 }\
 549 \
 550 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 551                                                 int src_stride1, int src_stride2, int h){\
 552     int i;\
 553     for(i=0; i<h; i++){\
 554         uint32_t a,b;\
 555         a= LD32(&src1[i*src_stride1  ]);\
 556         b= LD32(&src2[i*src_stride2  ]);\
 557         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 558         a= LD32(&src1[i*src_stride1+4]);\
 559         b= LD32(&src2[i*src_stride2+4]);\
 560         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 561     }\
 562 }\
 563 \
 564 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 565                                                 int src_stride1, int src_stride2, int h){\
 566     int i;\
 567     for(i=0; i<h; i++){\
 568         uint32_t a,b;\
 569         a= LD32(&src1[i*src_stride1  ]);\
 570         b= LD32(&src2[i*src_stride2  ]);\
 571         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 572         a= LD32(&src1[i*src_stride1+4]);\
 573         b= LD32(&src2[i*src_stride2+4]);\
 574         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 575     }\
 576 }\
 577 \
 578 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 579                                                 int src_stride1, int src_stride2, int h){\
 580     int i;\
 581     for(i=0; i<h; i++){\
 582         uint32_t a,b;\
 583         a= LD32(&src1[i*src_stride1  ]);\
 584         b= LD32(&src2[i*src_stride2  ]);\
 585         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 586     }\
 587 }\
 588 \
 589 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 590                                                 int src_stride1, int src_stride2, int h){\
 591     int i;\
 592     for(i=0; i<h; i++){\
 593         uint32_t a,b;\
 594         a= LD16(&src1[i*src_stride1  ]);\
 595         b= LD16(&src2[i*src_stride2  ]);\
 596         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 597     }\
 598 }\
 599 \
 600 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 601                                                 int src_stride1, int src_stride2, int h){\
 602     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 603     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 604 }\
 605 \
 606 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 607                                                 int src_stride1, int src_stride2, int h){\
 608     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 609     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 610 }\
 611 \
 612 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 613     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 614 }\
 615 \
 616 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 617     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 618 }\
 619 \
 620 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 621     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 622 }\
 623 \
 624 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 625     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 626 }\
 627 \
 628 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 629                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 630     int i;\
 631     for(i=0; i<h; i++){\
 632         uint32_t a, b, c, d, l0, l1, h0, h1;\
 633         a= LD32(&src1[i*src_stride1]);\
 634         b= LD32(&src2[i*src_stride2]);\
 635         c= LD32(&src3[i*src_stride3]);\
 636         d= LD32(&src4[i*src_stride4]);\
 637         l0=  (a&0x03030303UL)\
 638            + (b&0x03030303UL)\
 639            + 0x02020202UL;\
 640         h0= ((a&0xFCFCFCFCUL)>>2)\
 641           + ((b&0xFCFCFCFCUL)>>2);\
 642         l1=  (c&0x03030303UL)\
 643            + (d&0x03030303UL);\
 644         h1= ((c&0xFCFCFCFCUL)>>2)\
 645           + ((d&0xFCFCFCFCUL)>>2);\
 646         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 647         a= LD32(&src1[i*src_stride1+4]);\
 648         b= LD32(&src2[i*src_stride2+4]);\
 649         c= LD32(&src3[i*src_stride3+4]);\
 650         d= LD32(&src4[i*src_stride4+4]);\
 651         l0=  (a&0x03030303UL)\
 652            + (b&0x03030303UL)\
 653            + 0x02020202UL;\
 654         h0= ((a&0xFCFCFCFCUL)>>2)\
 655           + ((b&0xFCFCFCFCUL)>>2);\
 656         l1=  (c&0x03030303UL)\
 657            + (d&0x03030303UL);\
 658         h1= ((c&0xFCFCFCFCUL)>>2)\
 659           + ((d&0xFCFCFCFCUL)>>2);\
 660         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 661     }\
 662 }\
 663 \
 664 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 665     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 666 }\
 667 \
 668 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 669     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 670 }\
 671 \
 672 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 673     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 674 }\
 675 \
 676 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 677     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 678 }\
 679 \
 680 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 681                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 682     int i;\
 683     for(i=0; i<h; i++){\
 684         uint32_t a, b, c, d, l0, l1, h0, h1;\
 685         a= LD32(&src1[i*src_stride1]);\
 686         b= LD32(&src2[i*src_stride2]);\
 687         c= LD32(&src3[i*src_stride3]);\
 688         d= LD32(&src4[i*src_stride4]);\
 689         l0=  (a&0x03030303UL)\
 690            + (b&0x03030303UL)\
 691            + 0x01010101UL;\
 692         h0= ((a&0xFCFCFCFCUL)>>2)\
 693           + ((b&0xFCFCFCFCUL)>>2);\
 694         l1=  (c&0x03030303UL)\
 695            + (d&0x03030303UL);\
 696         h1= ((c&0xFCFCFCFCUL)>>2)\
 697           + ((d&0xFCFCFCFCUL)>>2);\
 698         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 699         a= LD32(&src1[i*src_stride1+4]);\
 700         b= LD32(&src2[i*src_stride2+4]);\
 701         c= LD32(&src3[i*src_stride3+4]);\
 702         d= LD32(&src4[i*src_stride4+4]);\
 703         l0=  (a&0x03030303UL)\
 704            + (b&0x03030303UL)\
 705            + 0x01010101UL;\
 706         h0= ((a&0xFCFCFCFCUL)>>2)\
 707           + ((b&0xFCFCFCFCUL)>>2);\
 708         l1=  (c&0x03030303UL)\
 709            + (d&0x03030303UL);\
 710         h1= ((c&0xFCFCFCFCUL)>>2)\
 711           + ((d&0xFCFCFCFCUL)>>2);\
 712         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 713     }\
 714 }\
 715 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 716                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 717     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 718     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 719 }\
 720 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 721                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 722     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 723     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 724 }\
 725 \
 726 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 727 {\
 728         int i, a0, b0, a1, b1;\
 729         a0= pixels[0];\
 730         b0= pixels[1] + 2;\
 731         a0 += b0;\
 732         b0 += pixels[2];\
 733 \
 734         pixels+=line_size;\
 735         for(i=0; i<h; i+=2){\
 736             a1= pixels[0];\
 737             b1= pixels[1];\
 738             a1 += b1;\
 739             b1 += pixels[2];\
 740 \
 741             block[0]= (a1+a0)>>2; /* FIXME non put */\
 742             block[1]= (b1+b0)>>2;\
 743 \
 744             pixels+=line_size;\
 745             block +=line_size;\
 746 \
 747             a0= pixels[0];\
 748             b0= pixels[1] + 2;\
 749             a0 += b0;\
 750             b0 += pixels[2];\
 751 \
 752             block[0]= (a1+a0)>>2;\
 753             block[1]= (b1+b0)>>2;\
 754             pixels+=line_size;\
 755             block +=line_size;\
 756         }\
 757 }\
 758 \
 759 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 760 {\
 761         int i;\
 762         const uint32_t a= LD32(pixels  );\
 763         const uint32_t b= LD32(pixels+1);\
 764         uint32_t l0=  (a&0x03030303UL)\
 765                     + (b&0x03030303UL)\
 766                     + 0x02020202UL;\
 767         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 768                    + ((b&0xFCFCFCFCUL)>>2);\
 769         uint32_t l1,h1;\
 770 \
 771         pixels+=line_size;\
 772         for(i=0; i<h; i+=2){\
 773             uint32_t a= LD32(pixels  );\
 774             uint32_t b= LD32(pixels+1);\
 775             l1=  (a&0x03030303UL)\
 776                + (b&0x03030303UL);\
 777             h1= ((a&0xFCFCFCFCUL)>>2)\
 778               + ((b&0xFCFCFCFCUL)>>2);\
 779             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 780             pixels+=line_size;\
 781             block +=line_size;\
 782             a= LD32(pixels  );\
 783             b= LD32(pixels+1);\
 784             l0=  (a&0x03030303UL)\
 785                + (b&0x03030303UL)\
 786                + 0x02020202UL;\
 787             h0= ((a&0xFCFCFCFCUL)>>2)\
 788               + ((b&0xFCFCFCFCUL)>>2);\
 789             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 790             pixels+=line_size;\
 791             block +=line_size;\
 792         }\
 793 }\
 794 \
 795 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 796 {\
 797     int j;\
 798     for(j=0; j<2; j++){\
 799         int i;\
 800         const uint32_t a= LD32(pixels  );\
 801         const uint32_t b= LD32(pixels+1);\
 802         uint32_t l0=  (a&0x03030303UL)\
 803                     + (b&0x03030303UL)\
 804                     + 0x02020202UL;\
 805         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 806                    + ((b&0xFCFCFCFCUL)>>2);\
 807         uint32_t l1,h1;\
 808 \
 809         pixels+=line_size;\
 810         for(i=0; i<h; i+=2){\
 811             uint32_t a= LD32(pixels  );\
 812             uint32_t b= LD32(pixels+1);\
 813             l1=  (a&0x03030303UL)\
 814                + (b&0x03030303UL);\
 815             h1= ((a&0xFCFCFCFCUL)>>2)\
 816               + ((b&0xFCFCFCFCUL)>>2);\
 817             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 818             pixels+=line_size;\
 819             block +=line_size;\
 820             a= LD32(pixels  );\
 821             b= LD32(pixels+1);\
 822             l0=  (a&0x03030303UL)\
 823                + (b&0x03030303UL)\
 824                + 0x02020202UL;\
 825             h0= ((a&0xFCFCFCFCUL)>>2)\
 826               + ((b&0xFCFCFCFCUL)>>2);\
 827             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 828             pixels+=line_size;\
 829             block +=line_size;\
 830         }\
 831         pixels+=4-line_size*(h+1);\
 832         block +=4-line_size*h;\
 833     }\
 834 }\
 835 \
 836 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 837 {\
 838     int j;\
 839     for(j=0; j<2; j++){\
 840         int i;\
 841         const uint32_t a= LD32(pixels  );\
 842         const uint32_t b= LD32(pixels+1);\
 843         uint32_t l0=  (a&0x03030303UL)\
 844                     + (b&0x03030303UL)\
 845                     + 0x01010101UL;\
 846         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 847                    + ((b&0xFCFCFCFCUL)>>2);\
 848         uint32_t l1,h1;\
 849 \
 850         pixels+=line_size;\
 851         for(i=0; i<h; i+=2){\
 852             uint32_t a= LD32(pixels  );\
 853             uint32_t b= LD32(pixels+1);\
 854             l1=  (a&0x03030303UL)\
 855                + (b&0x03030303UL);\
 856             h1= ((a&0xFCFCFCFCUL)>>2)\
 857               + ((b&0xFCFCFCFCUL)>>2);\
 858             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 859             pixels+=line_size;\
 860             block +=line_size;\
 861             a= LD32(pixels  );\
 862             b= LD32(pixels+1);\
 863             l0=  (a&0x03030303UL)\
 864                + (b&0x03030303UL)\
 865                + 0x01010101UL;\
 866             h0= ((a&0xFCFCFCFCUL)>>2)\
 867               + ((b&0xFCFCFCFCUL)>>2);\
 868             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 869             pixels+=line_size;\
 870             block +=line_size;\
 871         }\
 872         pixels+=4-line_size*(h+1);\
 873         block +=4-line_size*h;\
 874     }\
 875 }\
 876 \
 877 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
 878 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
 879 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
 880 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
 881 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
 882 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
 883 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
 884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
 885
 886 #define op_avg(a, b) a = rnd_avg32(a, b)
 887 #endif
 888 #define op_put(a, b) a = b
 889
 890 PIXOP2(avg, op_avg)
 891 PIXOP2(put, op_put)
 892 #undef op_avg
 893 #undef op_put
 894
 895 #define avg2(a,b) ((a+b+1)>>1)
 896 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 897
 898 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
 899     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
 900 }
 901
 902 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
 903     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
 904 }
 905
 906 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 907 {
 908     const int A=(16-x16)*(16-y16);
 909     const int B=(   x16)*(16-y16);
 910     const int C=(16-x16)*(   y16);
 911     const int D=(   x16)*(   y16);
 912     int i;
 913
 914     for(i=0; i<h; i++)
 915     {
 916         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 917         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 918         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 919         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 920         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 921         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 922         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 923         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 924         dst+= stride;
 925         src+= stride;
 926     }
 927 }
 928
 929 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 930                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 931 {
 932     int y, vx, vy;
 933     const int s= 1<<shift;
 934
 935     width--;
 936     height--;
 937
 938     for(y=0; y<h; y++){
 939         int x;
 940
 941         vx= ox;
 942         vy= oy;
 943         for(x=0; x<8; x++){ //XXX FIXME optimize
 944             int src_x, src_y, frac_x, frac_y, index;
 945
 946             src_x= vx>>16;
 947             src_y= vy>>16;
 948             frac_x= src_x&(s-1);
 949             frac_y= src_y&(s-1);
 950             src_x>>=shift;
 951             src_y>>=shift;
 952
 953             if((unsigned)src_x < width){
 954                 if((unsigned)src_y < height){
 955                     index= src_x + src_y*stride;
 956                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 957                                            + src[index       +1]*   frac_x )*(s-frac_y)
 958                                         + (  src[index+stride  ]*(s-frac_x)
 959                                            + src[index+stride+1]*   frac_x )*   frac_y
 960                                         + r)>>(shift*2);
 961                 }else{
 962                     index= src_x + clip(src_y, 0, height)*stride;
 963                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 964                                           + src[index       +1]*   frac_x )*s
 965                                         + r)>>(shift*2);
 966                 }
 967             }else{
 968                 if((unsigned)src_y < height){
 969                     index= clip(src_x, 0, width) + src_y*stride;
 970                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 971                                            + src[index+stride  ]*   frac_y )*s
 972                                         + r)>>(shift*2);
 973                 }else{
 974                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
 975                     dst[y*stride + x]=    src[index         ];
 976                 }
 977             }
 978
 979             vx+= dxx;
 980             vy+= dyx;
 981         }
 982         ox += dxy;
 983         oy += dyy;
 984     }
 985 }
 986
 987 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 988     switch(width){
 989     case 2: put_pixels2_c (dst, src, stride, height); break;
 990     case 4: put_pixels4_c (dst, src, stride, height); break;
 991     case 8: put_pixels8_c (dst, src, stride, height); break;
 992     case 16:put_pixels16_c(dst, src, stride, height); break;
 993     }
 994 }
 995
 996 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 997     int i,j;
 998     for (i=0; i < height; i++) {
 999       for (j=0; j < width; j++) {
1000         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1001       }
1002       src += stride;
1003       dst += stride;
1004     }
1005 }
1006
/**
 * Horizontal blend weighted 1:2 toward the right neighbour:
 * (src[x] + 2*src[x+1] + 1) * 683 >> 11, where 683/2048 stands in for 1/3.
 * Reads one column to the right of the block.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1017
/**
 * Vertical blend weighted 2:1 toward the current row:
 * (2*src[x] + src[x+stride] + 1) * 683 >> 11, where 683/2048 stands in for 1/3.
 * Reads one row below the block.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1028
/**
 * 2x2 blend with weights 4 (current), 3 (right), 3 (below), 2 (below-right).
 * The weighted sum plus 6 is multiplied by 2731 and shifted by 15;
 * 2731/32768 stands in for 1/12, the reciprocal of the weight total.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1039
/**
 * 2x2 blend with weights 3 (current), 2 (right), 4 (below), 3 (below-right).
 * The weighted sum plus 6 is multiplied by 2731 and shifted by 15;
 * 2731/32768 stands in for 1/12, the reciprocal of the weight total.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1050
/**
 * Vertical blend weighted 1:2 toward the row below:
 * (src[x] + 2*src[x+stride] + 1) * 683 >> 11, where 683/2048 stands in for 1/3.
 * Reads one row below the block.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1061
/**
 * 2x2 blend with weights 3 (current), 4 (right), 2 (below), 3 (below-right).
 * The weighted sum plus 6 is multiplied by 2731 and shifted by 15;
 * 2731/32768 stands in for 1/12, the reciprocal of the weight total.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1072
/**
 * 2x2 blend with weights 2 (current), 3 (right), 3 (below), 4 (below-right).
 * The weighted sum plus 6 is multiplied by 2731 and shifted by 15;
 * 2731/32768 stands in for 1/12, the reciprocal of the weight total.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1083
/**
 * Zero-offset tpel case, averaging variant: forwards to the avg_pixels helper
 * matching the block width. Widths other than 2, 4, 8 and 16 are silently
 * ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1092
/**
 * Averaging variant of the 2:1 horizontal blend. The interpolated value
 * (2*src[x] + src[x+1] + 1) * 683 >> 11 is combined with the pixel already in
 * dst as (dst + value + 1) >> 1.
 * Reads one column to the right of the block.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1103
/**
 * Averaging variant of the 1:2 horizontal blend. The interpolated value
 * (src[x] + 2*src[x+1] + 1) * 683 >> 11 is combined with the pixel already in
 * dst as (dst + value + 1) >> 1.
 * Reads one column to the right of the block.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1114
/**
 * Averaging variant of the 2:1 vertical blend. The interpolated value
 * (2*src[x] + src[x+stride] + 1) * 683 >> 11 is combined with the pixel
 * already in dst as (dst + value + 1) >> 1.
 * Reads one row below the block.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1125
/**
 * Averaging variant of the 4/3/3/2 blend (current, right, below, below-right).
 * The interpolated value (weighted sum + 6) * 2731 >> 15 is combined with the
 * pixel already in dst as (dst + value + 1) >> 1.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1136
/**
 * Averaging variant of the 3/2/4/3 blend (current, right, below, below-right).
 * The interpolated value (weighted sum + 6) * 2731 >> 15 is combined with the
 * pixel already in dst as (dst + value + 1) >> 1.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1147
/**
 * Averaging variant of the 1:2 vertical blend. The interpolated value
 * (src[x] + 2*src[x+stride] + 1) * 683 >> 11 is combined with the pixel
 * already in dst as (dst + value + 1) >> 1.
 * Reads one row below the block.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1158
/**
 * Averaging variant of the 3/4/2/3 blend (current, right, below, below-right).
 * The interpolated value (weighted sum + 6) * 2731 >> 15 is combined with the
 * pixel already in dst as (dst + value + 1) >> 1.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1169
/**
 * Averaging variant of the 2/3/3/4 blend (current, right, below, below-right).
 * The interpolated value (weighted sum + 6) * 2731 >> 15 is combined with the
 * pixel already in dst as (dst + value + 1) >> 1.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width wrappers around the put_tpel_pixels_mcXY_c
 * helpers: TPEL_WIDTH(8) would emit put_tpel_pixels8_mc00_c() ... _mc22_c(),
 * each forwarding to the generic helper with the width filled in.
 * Each body is a plain call; a leading `void` there would turn the statement
 * into an invalid declaration, so the wrapper would not compile if this
 * section were ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1201
/**
 * Emits one bilinear chroma interpolation function, W pixels wide.
 * The weights A..D are built from the offsets x,y (each asserted to be in
 * 0..7) and always sum to 64; OP receives the unscaled weighted sum of the
 * 2x2 neighbourhood and is responsible for rounding, scaling and storing.
 * Each row reads W+1 columns from the current and the following source line.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<W; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/** Emits the 2-, 4- and 8-pixel-wide chroma functions for one store operator. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* Store operators: both scale the weighted sum down with rounding (+32, >>6);
 * op_avg then takes the rounded mean with the value already in place. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1272
/**
 * Copies h rows from src to dst with one LD32/ST32 pair per row.
 * LD32/ST32 are defined outside this section (presumably 32-bit load/store,
 * i.e. 4 bytes per row - confirm against their definition).
 * @param dstStride distance between consecutive destination rows
 * @param srcStride distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1283
/**
 * Copies h rows from src to dst with two LD32/ST32 pairs per row, at byte
 * offsets 0 and 4.
 * LD32/ST32 are defined outside this section (presumably 32-bit load/store,
 * i.e. 8 bytes per row - confirm against their definition).
 * @param dstStride distance between consecutive destination rows
 * @param srcStride distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1295
/**
 * Copies h rows from src to dst with four LD32/ST32 pairs per row, at byte
 * offsets 0, 4, 8 and 12.
 * LD32/ST32 are defined outside this section (presumably 32-bit load/store,
 * i.e. 16 bytes per row - confirm against their definition).
 * @param dstStride distance between consecutive destination rows
 * @param srcStride distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1309
/**
 * Copies h rows from src to dst: four LD32/ST32 pairs at byte offsets 0, 4, 8
 * and 12, followed by the single byte at offset 16.
 * LD32/ST32 are defined outside this section (presumably 32-bit load/store,
 * i.e. 17 bytes per row - confirm against their definition).
 * @param dstStride distance between consecutive destination rows
 * @param srcStride distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16]; /* trailing odd byte */
        dst += dstStride;
        src += srcStride;
    }
}
1324
/**
 * Copies h rows from src to dst: two LD32/ST32 pairs at byte offsets 0 and 4,
 * followed by the single byte at offset 8.
 * LD32/ST32 are defined outside this section (presumably 32-bit load/store,
 * i.e. 9 bytes per row - confirm against their definition).
 * @param dstStride distance between consecutive destination rows
 * @param srcStride distance between consecutive source rows
 * @param h         number of rows; nothing is copied when h <= 0
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8]; /* trailing odd byte */
        dst += dstStride;
        src += srcStride;
    }
}
1337
1338
1339 #define QPEL_MC(r, OPNAME, RND, OP) \
1340 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1342     int i;\
1343     for(i=0; i<h; i++)\
1344     {\
1345         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1353         dst+=dstStride;\
1354         src+=srcStride;\
1355     }\
1356 }\
1357 \
1358 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1359     const int w=8;\
1360     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1361     int i;\
1362     for(i=0; i<w; i++)\
1363     {\
1364         const int src0= src[0*srcStride];\
1365         const int src1= src[1*srcStride];\
1366         const int src2= src[2*srcStride];\
1367         const int src3= src[3*srcStride];\
1368         const int src4= src[4*srcStride];\
1369         const int src5= src[5*srcStride];\
1370         const int src6= src[6*srcStride];\
1371         const int src7= src[7*srcStride];\
1372         const int src8= src[8*srcStride];\
1373         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1381         dst++;\
1382         src++;\
1383     }\
1384 }\
1385 \
1386 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1388     int i;\
1389     \
1390     for(i=0; i<h; i++)\
1391     {\
1392         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1408         dst+=dstStride;\
1409         src+=srcStride;\
1410     }\
1411 }\
1412 \
1413 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1415     int i;\
1416     const int w=16;\
1417     for(i=0; i<w; i++)\
1418     {\
1419         const int src0= src[0*srcStride];\
1420         const int src1= src[1*srcStride];\
1421         const int src2= src[2*srcStride];\
1422         const int src3= src[3*srcStride];\
1423         const int src4= src[4*srcStride];\
1424         const int src5= src[5*srcStride];\
1425         const int src6= src[6*srcStride];\
1426         const int src7= src[7*srcStride];\
1427         const int src8= src[8*srcStride];\
1428         const int src9= src[9*srcStride];\
1429         const int src10= src[10*srcStride];\
1430         const int src11= src[11*srcStride];\
1431         const int src12= src[12*srcStride];\
1432         const int src13= src[13*srcStride];\
1433         const int src14= src[14*srcStride];\
1434         const int src15= src[15*srcStride];\
1435         const int src16= src[16*srcStride];\
1436         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1452         dst++;\
1453         src++;\
1454     }\
1455 }\
1456 \
1457 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1458     OPNAME ## pixels8_c(dst, src, stride, 8);\
1459 }\
1460 \
1461 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1462     uint8_t half[64];\
1463     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1465 }\
1466 \
1467 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1468     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1469 }\
1470 \
1471 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1472     uint8_t half[64];\
1473     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1475 }\
1476 \
1477 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478     uint8_t full[16*9];\
1479     uint8_t half[64];\
1480     copy_block9(full, src, 16, stride, 9);\
1481     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1482     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1483 }\
1484 \
1485 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486     uint8_t full[16*9];\
1487     copy_block9(full, src, 16, stride, 9);\
1488     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1489 }\
1490 \
1491 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492     uint8_t full[16*9];\
1493     uint8_t half[64];\
1494     copy_block9(full, src, 16, stride, 9);\
1495     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1496     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1497 }\
1498 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499     uint8_t full[16*9];\
1500     uint8_t halfH[72];\
1501     uint8_t halfV[64];\
1502     uint8_t halfHV[64];\
1503     copy_block9(full, src, 16, stride, 9);\
1504     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508 }\
1509 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510     uint8_t full[16*9];\
1511     uint8_t halfH[72];\
1512     uint8_t halfHV[64];\
1513     copy_block9(full, src, 16, stride, 9);\
1514     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518 }\
1519 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520     uint8_t full[16*9];\
1521     uint8_t halfH[72];\
1522     uint8_t halfV[64];\
1523     uint8_t halfHV[64];\
1524     copy_block9(full, src, 16, stride, 9);\
1525     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529 }\
1530 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531     uint8_t full[16*9];\
1532     uint8_t halfH[72];\
1533     uint8_t halfHV[64];\
1534     copy_block9(full, src, 16, stride, 9);\
1535     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1539 }\
1540 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541     uint8_t full[16*9];\
1542     uint8_t halfH[72];\
1543     uint8_t halfV[64];\
1544     uint8_t halfHV[64];\
1545     copy_block9(full, src, 16, stride, 9);\
1546     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1547     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1550 }\
1551 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552     uint8_t full[16*9];\
1553     uint8_t halfH[72];\
1554     uint8_t halfHV[64];\
1555     copy_block9(full, src, 16, stride, 9);\
1556     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560 }\
1561 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562     uint8_t full[16*9];\
1563     uint8_t halfH[72];\
1564     uint8_t halfV[64];\
1565     uint8_t halfHV[64];\
1566     copy_block9(full, src, 16, stride, 9);\
1567     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1568     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1570     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1571 }\
1572 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573     uint8_t full[16*9];\
1574     uint8_t halfH[72];\
1575     uint8_t halfHV[64];\
1576     copy_block9(full, src, 16, stride, 9);\
1577     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1581 }\
1582 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1583     uint8_t halfH[72];\
1584     uint8_t halfHV[64];\
1585     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1586     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1587     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1588 }\
1589 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1590     uint8_t halfH[72];\
1591     uint8_t halfHV[64];\
1592     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1593     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1594     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1595 }\
1596 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597     uint8_t full[16*9];\
1598     uint8_t halfH[72];\
1599     uint8_t halfV[64];\
1600     uint8_t halfHV[64];\
1601     copy_block9(full, src, 16, stride, 9);\
1602     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1605     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1606 }\
1607 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608     uint8_t full[16*9];\
1609     uint8_t halfH[72];\
1610     copy_block9(full, src, 16, stride, 9);\
1611     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1614 }\
1615 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616     uint8_t full[16*9];\
1617     uint8_t halfH[72];\
1618     uint8_t halfV[64];\
1619     uint8_t halfHV[64];\
1620     copy_block9(full, src, 16, stride, 9);\
1621     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1622     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1624     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1625 }\
1626 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627     uint8_t full[16*9];\
1628     uint8_t halfH[72];\
1629     copy_block9(full, src, 16, stride, 9);\
1630     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1633 }\
1634 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1635     uint8_t halfH[72];\
1636     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1637     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1638 }\
1639 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1640     OPNAME ## pixels16_c(dst, src, stride, 16);\
1641 }\
1642 \
1643 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644     uint8_t half[256];\
1645     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1647 }\
1648 \
1649 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1650     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1651 }\
1652 \
1653 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654     uint8_t half[256];\
1655     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1657 }\
1658 \
1659 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660     uint8_t full[24*17];\
1661     uint8_t half[256];\
1662     copy_block17(full, src, 24, stride, 17);\
1663     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1664     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1665 }\
1666 \
1667 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668     uint8_t full[24*17];\
1669     copy_block17(full, src, 24, stride, 17);\
1670     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1671 }\
1672 \
1673 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674     uint8_t full[24*17];\
1675     uint8_t half[256];\
1676     copy_block17(full, src, 24, stride, 17);\
1677     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1678     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1679 }\
1680 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681     uint8_t full[24*17];\
1682     uint8_t halfH[272];\
1683     uint8_t halfV[256];\
1684     uint8_t halfHV[256];\
1685     copy_block17(full, src, 24, stride, 17);\
1686     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690 }\
1691 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692     uint8_t full[24*17];\
1693     uint8_t halfH[272];\
1694     uint8_t halfHV[256];\
1695     copy_block17(full, src, 24, stride, 17);\
1696     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700 }\
1701 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702     uint8_t full[24*17];\
1703     uint8_t halfH[272];\
1704     uint8_t halfV[256];\
1705     uint8_t halfHV[256];\
1706     copy_block17(full, src, 24, stride, 17);\
1707     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711 }\
1712 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713     uint8_t full[24*17];\
1714     uint8_t halfH[272];\
1715     uint8_t halfHV[256];\
1716     copy_block17(full, src, 24, stride, 17);\
1717     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1721 }\
1722 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723     uint8_t full[24*17];\
1724     uint8_t halfH[272];\
1725     uint8_t halfV[256];\
1726     uint8_t halfHV[256];\
1727     copy_block17(full, src, 24, stride, 17);\
1728     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1729     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732 }\
1733 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734     uint8_t full[24*17];\
1735     uint8_t halfH[272];\
1736     uint8_t halfHV[256];\
1737     copy_block17(full, src, 24, stride, 17);\
1738     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742 }\
1743 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744     uint8_t full[24*17];\
1745     uint8_t halfH[272];\
1746     uint8_t halfV[256];\
1747     uint8_t halfHV[256];\
1748     copy_block17(full, src, 24, stride, 17);\
1749     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1750     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1752     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1753 }\
1754 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755     uint8_t full[24*17];\
1756     uint8_t halfH[272];\
1757     uint8_t halfHV[256];\
1758     copy_block17(full, src, 24, stride, 17);\
1759     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1763 }\
1764 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765     uint8_t halfH[272];\
1766     uint8_t halfHV[256];\
1767     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1768     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1769     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1770 }\
1771 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772     uint8_t halfH[272];\
1773     uint8_t halfHV[256];\
1774     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1775     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1776     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1777 }\
1778 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779     uint8_t full[24*17];\
1780     uint8_t halfH[272];\
1781     uint8_t halfV[256];\
1782     uint8_t halfHV[256];\
1783     copy_block17(full, src, 24, stride, 17);\
1784     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1787     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1788 }\
1789 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790     uint8_t full[24*17];\
1791     uint8_t halfH[272];\
1792     copy_block17(full, src, 24, stride, 17);\
1793     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1796 }\
1797 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798     uint8_t full[24*17];\
1799     uint8_t halfH[272];\
1800     uint8_t halfV[256];\
1801     uint8_t halfHV[256];\
1802     copy_block17(full, src, 24, stride, 17);\
1803     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1804     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1806     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1807 }\
1808 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809     uint8_t full[24*17];\
1810     uint8_t halfH[272];\
1811     copy_block17(full, src, 24, stride, 17);\
1812     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1815 }\
1816 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817     uint8_t halfH[272];\
1818     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1819     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1820 }
1821
/*
 * Pixel store operators; op_put, op_put_no_rnd and op_avg are passed as the
 * last argument of the QPEL_MC instantiations that follow.
 *
 * Each takes a destination lvalue `a` and an unscaled filter sum `b`.  The sum
 * is biased, shifted right by 5 (i.e. divided by 32) and used as an index into
 * the table `cm`:
 *   op_put        : a = cm[(b + 16) >> 5]
 *   op_put_no_rnd : a = cm[(b + 15) >> 5]           (bias 15 instead of 16)
 *   op_avg        : a = (a + cm[(b + 16) >> 5] + 1) >> 1
 *                   (mean of the old value of `a` and the new sample, +1 bias)
 *   op_avg_no_rnd : a = (a + cm[(b + 15) >> 5]) >> 1  (bias 15, no +1)
 *
 * `cm` is not a macro parameter: an identifier of that name has to be visible
 * wherever one of these macros is expanded.
 * NOTE(review): presumably a clamp-to-0..255 lookup derived from cropTbl --
 * confirm in the lowpass functions that expand OP.
 *
 * The expansions are bare assignments, and the avg variants evaluate `a`
 * twice, so `a` must be a side-effect-free lvalue.
 * op_avg_no_rnd is defined here but is not passed to any active QPEL_MC
 * instantiation below.
 */
1822 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1823 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1824 #define op_put(a, b) a = cm[((b) + 16)>>5]
1825 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1826
/*
 * Instantiate the QPEL_MC function family three times: with the put_,
 * put_no_rnd_ and avg_ name prefixes.
 * NOTE(review): the head of QPEL_MC is not visible from here.  Judging by the
 * visible part of its body, the 2nd argument appears to be the OPNAME prefix
 * pasted onto the generated function names, the 3rd the RND infix used in the
 * put ## RND ## mpeg4_qpel*_lowpass helper names, and the 4th the OP store
 * operator.  The role of the leading 0/1 argument should be confirmed against
 * the macro definition.
 */
1827 QPEL_MC(0, put_       , _       , op_put)
1828 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1829 QPEL_MC(0, avg_       , _       , op_avg)
/*
 * The avg_no_rnd variant is not instantiated.  Note that the disabled line
 * names op_avg, not op_avg_no_rnd, as its store operator.
 */
1830 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/*
 * Release the operator macro names once the instantiations are done.
 * NOTE(review): presumably so that later sections can define their own op_*
 * macros -- confirm further down the file.
 */
1831 #undef op_avg
1832 #undef op_avg_no_rnd
1833 #undef op_put
1834 #undef op_put_no_rnd
1835
1836 #if 1
/**
 * Generates the h264 qpel lowpass helper functions for 4x4, 8x8 and 16x16
 * blocks.
 *
 * Macro parameters:
 *   OPNAME  token pasted in front of every generated function name.
 *   OP      store operator, invoked as OP(dst_lvalue, sum) by the _h_ and _v_
 *           functions.
 *   OP2     store operator, invoked as OP2(dst_lvalue, sum) by the second pass
 *           of the _hv_ functions.
 *
 * Generated functions (all static void, N = 4, 8, 16):
 *   OPNAME h264_qpelN_h_lowpass (dst, src, dstStride, srcStride)
 *   OPNAME h264_qpelN_v_lowpass (dst, src, dstStride, srcStride)
 *   OPNAME h264_qpelN_hv_lowpass(dst, tmp, src, dstStride, tmpStride, srcStride)
 *
 * Filter: every output is the 6-tap sum
 *   (x0+x1)*20 - (x-1+x2)*5 + (x-2+x3)
 * taken along a row (_h_) or along a column (_v_).  The tap weights total 32
 * and the sum is handed to OP unscaled; any scaling, rounding or clamping is
 * entirely up to OP.  In the _hv_ functions the horizontal sums are stored
 * unscaled in the int16_t buffer tmp and the same 6 taps are then applied
 * down the columns of tmp, so OP2 receives a sum whose weights total
 * 32*32 = 1024.
 *
 * Memory touched (N = 4 or 8):
 *   _h_  : N rows; each row reads src[-2] .. src[N+2] and writes dst[0..N-1].
 *   _v_  : N columns; each column reads rows -2 .. N+2 of src and writes rows
 *          0 .. N-1 of dst.  The loop runs over columns, the rows are unrolled.
 *   _hv_ : src is first moved up two rows, then N+5 rows are filtered
 *          horizontally into tmp (N values per row, tmpStride apart).  tmp is
 *          then moved back by N+5-2 rows so that it points at the filtered row
 *          belonging to output row 0, with two filtered rows above it
 *          (tmp[-2*tmpStride], tmp[-1*tmpStride]) and three below the block.
 *
 * The 16x16 variants are assembled from four calls to the 8x8 function: left
 * and right halves (pointer + 8), then the same again after advancing the
 * pointers by 8 rows.  For _hv_ the tmp pointer is advanced by 8*tmpStride as
 * well, so the lower half uses tmp rows 8 .. 20; tmp therefore has to provide
 * 21 rows in the 16x16 case.
 *
 * Each generated 4x4/8x8 function declares cm = cropTbl + MAX_NEG_CROP, but the
 * macro text itself never reads it.
 * NOTE(review): presumably it exists for the OP/OP2 expansions to index --
 * confirm at the H264_LOWPASS instantiation sites.
 */
1837 #define H264_LOWPASS(OPNAME, OP, OP2) \
1838 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1839     const int h=4;\
1840     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1841     int i;\
1842     for(i=0; i<h; i++)\
1843     {\
1844         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1845         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1846         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1847         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1848         dst+=dstStride;\
1849         src+=srcStride;\
1850     }\
1851 }\
1852 \
1853 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1854     const int w=4;\
1855     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1856     int i;\
1857     for(i=0; i<w; i++)\
1858     {\
1859         const int srcB= src[-2*srcStride];\
1860         const int srcA= src[-1*srcStride];\
1861         const int src0= src[0 *srcStride];\
1862         const int src1= src[1 *srcStride];\
1863         const int src2= src[2 *srcStride];\
1864         const int src3= src[3 *srcStride];\
1865         const int src4= src[4 *srcStride];\
1866         const int src5= src[5 *srcStride];\
1867         const int src6= src[6 *srcStride];\
1868         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1869         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1870         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1871         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1872         dst++;\
1873         src++;\
1874     }\
1875 }\
1876 \
1877 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1878     const int h=4;\
1879     const int w=4;\
1880     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1881     int i;\
1882     src -= 2*srcStride;\
1883     for(i=0; i<h+5; i++)\
1884     {\
1885         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1886         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1887         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1888         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1889         tmp+=tmpStride;\
1890         src+=srcStride;\
1891     }\
1892     tmp -= tmpStride*(h+5-2);\
1893     for(i=0; i<w; i++)\
1894     {\
1895         const int tmpB= tmp[-2*tmpStride];\
1896         const int tmpA= tmp[-1*tmpStride];\
1897         const int tmp0= tmp[0 *tmpStride];\
1898         const int tmp1= tmp[1 *tmpStride];\
1899         const int tmp2= tmp[2 *tmpStride];\
1900         const int tmp3= tmp[3 *tmpStride];\
1901         const int tmp4= tmp[4 *tmpStride];\
1902         const int tmp5= tmp[5 *tmpStride];\
1903         const int tmp6= tmp[6 *tmpStride];\
1904         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1905         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1906         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1907         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1908         dst++;\
1909         tmp++;\
1910     }\
1911 }\
1912 \
1913 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1914     const int h=8;\
1915     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1916     int i;\
1917     for(i=0; i<h; i++)\
1918     {\
1919         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1920         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1921         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1922         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1923         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1924         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1925         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1926         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1927         dst+=dstStride;\
1928         src+=srcStride;\
1929     }\
1930 }\
1931 \
1932 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1933     const int w=8;\
1934     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1935     int i;\
1936     for(i=0; i<w; i++)\
1937     {\
1938         const int srcB= src[-2*srcStride];\
1939         const int srcA= src[-1*srcStride];\
1940         const int src0= src[0 *srcStride];\
1941         const int src1= src[1 *srcStride];\
1942         const int src2= src[2 *srcStride];\
1943         const int src3= src[3 *srcStride];\
1944         const int src4= src[4 *srcStride];\
1945         const int src5= src[5 *srcStride];\
1946         const int src6= src[6 *srcStride];\
1947         const int src7= src[7 *srcStride];\
1948         const int src8= src[8 *srcStride];\
1949         const int src9= src[9 *srcStride];\
1950         const int src10=src[10*srcStride];\
1951         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1952         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1953         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1954         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1955         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1956         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1957         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1958         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1959         dst++;\
1960         src++;\
1961     }\
1962 }\
1963 \
1964 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1965     const int h=8;\
1966     const int w=8;\
1967     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1968     int i;\
1969     src -= 2*srcStride;\
1970     for(i=0; i<h+5; i++)\
1971     {\
1972         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1973         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1974         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1975         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1976         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1977         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1978         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1979         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1980         tmp+=tmpStride;\
1981         src+=srcStride;\
1982     }\
1983     tmp -= tmpStride*(h+5-2);\
1984     for(i=0; i<w; i++)\
1985     {\
1986         const int tmpB= tmp[-2*tmpStride];\
1987         const int tmpA= tmp[-1*tmpStride];\
1988         const int tmp0= tmp[0 *tmpStride];\
1989         const int tmp1= tmp[1 *tmpStride];\
1990         const int tmp2= tmp[2 *tmpStride];\
1991         const int tmp3= tmp[3 *tmpStride];\
1992         const int tmp4= tmp[4 *tmpStride];\
1993         const int tmp5= tmp[5 *tmpStride];\
1994         const int tmp6= tmp[6 *tmpStride];\
1995         const int tmp7= tmp[7 *tmpStride];\
1996         const int tmp8= tmp[8 *tmpStride];\
1997         const int tmp9= tmp[9 *tmpStride];\
1998         const int tmp10=tmp[10*tmpStride];\
1999         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2000         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2001         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2002         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2003         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2004         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2005         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2006         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2007         dst++;\
2008         tmp++;\
2009     }\
2010 }\
2011 \
2012 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2013     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2014     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2015     src += 8*srcStride;\
2016     dst += 8*dstStride;\
2017     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2018     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2019 }\
2020 \
2021 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2022     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2023     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2024     src += 8*srcStride;\
2025     dst += 8*dstStride;\
2026     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2027     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2028 }\
2029 \
2030 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2031     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2032     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2033     src += 8*srcStride;\
2034     tmp += 8*tmpStride;\
2035     dst += 8*dstStride;\
2036     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2037     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2038 }\
2039
2040 #define H264_MC(OPNAME, SIZE) \
2041 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2042     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2043 }\
2044 \
2045 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2046     uint8_t half[SIZE*SIZE];\
2047     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2048     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2049 }\
2050 \
2051 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2052     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2053 }\
2054 \
2055 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2056     uint8_t half[SIZE*SIZE];\
2057     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2058     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2059 }\
2060 \
2061 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2062     uint8_t full[SIZE*(SIZE+5)];\
2063     uint8_t * const full_mid= full + SIZE*2;\
2064     uint8_t half[SIZE*SIZE];\
2065     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2066     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2067     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2068 }\
2069 \
2070 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2071     uint8_t full[SIZE*(SIZE+5)];\
2072     uint8_t * const full_mid= full + SIZE*2;\
2073     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2074     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2075 }\
2076 \
2077 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2078     uint8_t full[SIZE*(SIZE+5)];\
2079     uint8_t * const full_mid= full + SIZE*2;\
2080     uint8_t half[SIZE*SIZE];\
2081     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2082     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2083     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2084 }\
2085 \
2086 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2087     uint8_t full[SIZE*(SIZE+5)];\
2088     uint8_t * const full_mid= full + SIZE*2;\
2089     uint8_t halfH[SIZE*SIZE];\
2090     uint8_t halfV[SIZE*SIZE];\
2091     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2092     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2093     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2094     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2095 }\
2096 \
2097 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2098     uint8_t full[SIZE*(SIZE+5)];\
2099     uint8_t * const full_mid= full + SIZE*2;\
2100     uint8_t halfH[SIZE*SIZE];\
2101     uint8_t halfV[SIZE*SIZE];\
2102     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2103     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2104     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2105     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2106 }\
2107 \
2108 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2109     uint8_t full[SIZE*(SIZE+5)];\
2110     uint8_t * const full_mid= full + SIZE*2;\
2111     uint8_t halfH[SIZE*SIZE];\
2112     uint8_t halfV[SIZE*SIZE];\
2113     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2114     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2115     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2116     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2117 }\
2118 \
2119 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2120     uint8_t full[SIZE*(SIZE+5)];\
2121     uint8_t * const full_mid= full + SIZE*2;\
2122     uint8_t halfH[SIZE*SIZE];\
2123     uint8_t halfV[SIZE*SIZE];\
2124     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2125     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2126     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2127     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2128 }\
2129 \
2130 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2131     int16_t tmp[SIZE*(SIZE+5)];\
2132     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2133 }\
2134 \
2135 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2136     int16_t tmp[SIZE*(SIZE+5)];\
2137     uint8_t halfH[SIZE*SIZE];\
2138     uint8_t halfHV[SIZE*SIZE];\
2139     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2140     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2141     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2142 }\
2143 \
2144 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2145     int16_t tmp[SIZE*(SIZE+5)];\
2146     uint8_t halfH[SIZE*SIZE];\
2147     uint8_t halfHV[SIZE*SIZE];\
2148     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2149     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2150     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2151 }\
2152 \
2153 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2154     uint8_t full[SIZE*(SIZE+5)];\
2155     uint8_t * const full_mid= full + SIZE*2;\
2156     int16_t tmp[SIZE*(SIZE+5)];\
2157     uint8_t halfV[SIZE*SIZE];\
2158     uint8_t halfHV[SIZE*SIZE];\
2159     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2160     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2161     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2162     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2163 }\
2164 \
2165 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2166     uint8_t full[SIZE*(SIZE+5)];\
2167     uint8_t * const full_mid= full + SIZE*2;\
2168     int16_t tmp[SIZE*(SIZE+5)];\
2169     uint8_t halfV[SIZE*SIZE];\
2170     uint8_t halfHV[SIZE*SIZE];\
2171     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2172     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2173     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2174     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2175 }\
2176
2177 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2178 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2179 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2180 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2181 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2182
2183 H264_LOWPASS(put_       , op_put, op2_put)
2184 H264_LOWPASS(avg_       , op_avg, op2_avg)
2185 H264_MC(put_, 4)
2186 H264_MC(put_, 8)
2187 H264_MC(put_, 16)
2188 H264_MC(avg_, 4)
2189 H264_MC(avg_, 8)
2190 H264_MC(avg_, 16)
2191
2192 #undef op_avg
2193 #undef op_put
2194 #undef op2_avg
2195 #undef op2_put
2196 #endif
2197
2198 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2200     int i;
2201
2202     for(i=0; i<h; i++){
2203         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2211         dst+=dstStride;
2212         src+=srcStride;
2213     }
2214 }
2215
2216 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2218     int i;
2219
2220     for(i=0; i<w; i++){
2221         const int src_1= src[ -srcStride];
2222         const int src0 = src[0          ];
2223         const int src1 = src[  srcStride];
2224         const int src2 = src[2*srcStride];
2225         const int src3 = src[3*srcStride];
2226         const int src4 = src[4*srcStride];
2227         const int src5 = src[5*srcStride];
2228         const int src6 = src[6*srcStride];
2229         const int src7 = src[7*srcStride];
2230         const int src8 = src[8*srcStride];
2231         const int src9 = src[9*srcStride];
2232         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2234         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2235         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2236         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2237         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2238         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2239         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2240         src++;
2241         dst++;
2242     }
2243 }
2244
/** mspel case mc00: no filtering, forwards to put_pixels8_c() for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2248
/**
 * mspel case mc10: combines the source block with its horizontally filtered
 * version through put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];   /* packed 8x8 scratch, row pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2254
/** mspel case mc20: horizontal filter written straight into dst, 8 rows. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2258
/**
 * mspel case mc30: combines the source block shifted one pixel to the right
 * with the horizontally filtered block through put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];   /* packed 8x8 scratch, row pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2264
/** mspel case mc02: vertical filter written straight into dst, 8 columns. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2268
/**
 * mspel case mc12: combines the vertically filtered block with the
 * horizontally-then-vertically filtered block through put_pixels8_l2().
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[11*8];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    /* rows_h+8 is the row that corresponds to src itself */
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);

    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel case mc32: like mc12, but the purely vertical filter runs on the
 * block shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[11*8];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    /* rows_h+8 is the row that corresponds to src itself */
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);

    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel case mc22: horizontal filter into a scratch buffer, then vertical
 * filter from that buffer into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[11*8];   /* 11 horizontally filtered rows, starting one row above src */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical taps are centred on src */
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);
}
2292
2293 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2294     int x;
2295     const int strength= ff_h263_loop_filter_strength[qscale];
2296
2297     for(x=0; x<8; x++){
2298         int d1, d2, ad1;
2299         int p0= src[x-2*stride];
2300         int p1= src[x-1*stride];
2301         int p2= src[x+0*stride];
2302         int p3= src[x+1*stride];
2303         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2304
2305         if     (d<-2*strength) d1= 0;
2306         else if(d<-  strength) d1=-2*strength - d;
2307         else if(d<   strength) d1= d;
2308         else if(d< 2*strength) d1= 2*strength - d;
2309         else                   d1= 0;
2310
2311         p1 += d1;
2312         p2 -= d1;
2313         if(p1&256) p1= ~(p1>>31);
2314         if(p2&256) p2= ~(p2>>31);
2315
2316         src[x-1*stride] = p1;
2317         src[x+0*stride] = p2;
2318
2319         ad1= ABS(d1)>>1;
2320
2321         d2= clip((p0-p3)/4, -ad1, ad1);
2322
2323         src[x-2*stride] = p0 - d2;
2324         src[x+  stride] = p3 + d2;
2325     }
2326 }
2327
2328 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2329     int y;
2330     const int strength= ff_h263_loop_filter_strength[qscale];
2331
2332     for(y=0; y<8; y++){
2333         int d1, d2, ad1;
2334         int p0= src[y*stride-2];
2335         int p1= src[y*stride-1];
2336         int p2= src[y*stride+0];
2337         int p3= src[y*stride+1];
2338         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2339
2340         if     (d<-2*strength) d1= 0;
2341         else if(d<-  strength) d1=-2*strength - d;
2342         else if(d<   strength) d1= d;
2343         else if(d< 2*strength) d1= 2*strength - d;
2344         else                   d1= 0;
2345
2346         p1 += d1;
2347         p2 -= d1;
2348         if(p1&256) p1= ~(p1>>31);
2349         if(p2&256) p2= ~(p2>>31);
2350
2351         src[y*stride-1] = p1;
2352         src[y*stride+0] = p2;
2353
2354         ad1= ABS(d1)>>1;
2355
2356         d2= clip((p0-p3)/4, -ad1, ad1);
2357
2358         src[y*stride-2] = p0 - d2;
2359         src[y*stride+1] = p3 + d2;
2360     }
2361 }
2362
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 * The vertical pass leaves the first and last row unfiltered (scaled by 4
 * only); the horizontal pass leaves the first and last column unfiltered.
 *
 * @param src    top-left pixel of the block, overwritten with the result
 * @param stride distance between two rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8*8];   /* vertical pass output, carries a gain of 4 */
    int x, y;

    /* vertical pass: src -> vert */
    for(y=0; y<8; y++){
        const uint8_t * const in  = src  + y*stride;
        int           * const out = vert + y*8;
        const int border = (y==0 || y==7);

        for(x=0; x<8; x++){
            if(border) out[x] = 4*in[x];
            else       out[x] = in[x - stride] + 2*in[x] + in[x + stride];
        }
    }

    /* horizontal pass: vert -> src, rounding away the accumulated gain */
    for(y=0; y<8; y++){
        const int * const in  = vert + y*8;
        uint8_t   * const out = src  + y*stride;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(x=1; x<7; x++)
            out[x] = (in[x-1] + 2*in[x] + in[x+1] + 8)>>4;
    }
}
2389
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2417
/**
 * Sum of absolute differences, 16 pixels wide, against pix2 averaged with
 * its right-hand neighbour (avg2 of pix2[x] and pix2[x+1]).
 * Reads pix2[0] .. pix2[16] on every row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2445
/**
 * Sum of absolute differences, 16 pixels wide, against pix2 averaged with
 * the row below it (avg2 of pix2[x] and pix2[x + line_size]).
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        uint8_t * const below = pix2 + line_size;

        for(x=0; x<16; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2  = below;
    }
    return total;
}
2475
/**
 * Sum of absolute differences, 16 pixels wide, against the avg4 of each
 * pix2 pixel with its right, lower and lower-right neighbours.
 * Reads h+1 rows of pix2, columns 0 .. 16.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        uint8_t * const below = pix2 + line_size;

        for(x=0; x<16; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1 += line_size;
        pix2  = below;
    }
    return total;
}
2505
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2525
/**
 * Sum of absolute differences, 8 pixels wide, against pix2 averaged with
 * its right-hand neighbour (avg2 of pix2[x] and pix2[x+1]).
 * Reads pix2[0] .. pix2[8] on every row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2545
/**
 * Sum of absolute differences, 8 pixels wide, against pix2 averaged with
 * the row below it (avg2 of pix2[x] and pix2[x + line_size]).
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        uint8_t * const below = pix2 + line_size;

        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2  = below;
    }
    return total;
}
2567
/**
 * Sum of absolute differences, 8 pixels wide, against the avg4 of each
 * pix2 pixel with its right, lower and lower-right neighbours.
 * Reads h+1 rows of pix2, columns 0 .. 8.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        uint8_t * const below = pix2 + line_size;

        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1 += line_size;
        pix2  = below;
    }
    return total;
}
2589
2590 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2591     int score1=0;
2592     int score2=0;
2593     int x,y;
2594
2595     for(y=0; y<h; y++){
2596         for(x=0; x<16; x++){
2597             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2598         }
2599         if(y+1<h){
2600             for(x=0; x<15; x++){
2601                 score2+= ABS(  s1[x  ] - s1[x  +stride]
2602                              - s1[x+1] + s1[x+1+stride])
2603                         -ABS(  s2[x  ] - s2[x  +stride]
2604                              - s2[x+1] + s2[x+1+stride]);
2605             }
2606         }
2607         s1+= stride;
2608         s2+= stride;
2609     }
2610
2611     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2612     else  return score1 + ABS(score2)*8;
2613 }
2614
2615 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2616     int score1=0;
2617     int score2=0;
2618     int x,y;
2619
2620     for(y=0; y<h; y++){
2621         for(x=0; x<8; x++){
2622             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2623         }
2624         if(y+1<h){
2625             for(x=0; x<7; x++){
2626                 score2+= ABS(  s1[x  ] - s1[x  +stride]
2627                              - s1[x+1] + s1[x+1+stride])
2628                         -ABS(  s2[x  ] - s2[x  +stride]
2629                              - s2[x+1] + s2[x+1+stride]);
2630             }
2631         }
2632         s1+= stride;
2633         s2+= stride;
2634     }
2635
2636     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2637     else  return score1 + ABS(score2)*8;
2638 }
2639
2640 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2641     int i;
2642     unsigned int sum=0;
2643
2644     for(i=0; i<8*8; i++){
2645         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2646         int w= weight[i];
2647         b>>= RECON_SHIFT;
2648         assert(-512<b && b<512);
2649
2650         sum += (w*b)*(w*b)>>4;
2651     }
2652     return sum>>2;
2653 }
2654
2655 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2656     int i;
2657
2658     for(i=0; i<8*8; i++){
2659         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2660     }
2661 }
2662
2663 /**
2664  * permutes an 8x8 block.
2665  * @param block the block which will be permuted according to the given permutation vector
2666  * @param permutation the permutation vector
2667  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2668  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2669  *                  (inverse) permutated to scantable order!
2670  */
2671 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2672 {
2673     int i;
2674     DCTELEM temp[64];
2675
2676     if(last<=0) return;
2677     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2678
2679     for(i=0; i<=last; i++){
2680         const int j= scantable[i];
2681         temp[j]= block[j];
2682         block[j]=0;
2683     }
2684
2685     for(i=0; i<=last; i++){
2686         const int j= scantable[i];
2687         const int perm_j= permutation[j];
2688         block[perm_j]= temp[j];
2689     }
2690 }
2691
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    const int score = 0;

    return score;
}
2695
2696 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2697     int i;
2698
2699     memset(cmp, 0, sizeof(void*)*5);
2700
2701     for(i=0; i<5; i++){
2702         switch(type&0xFF){
2703         case FF_CMP_SAD:
2704             cmp[i]= c->sad[i];
2705             break;
2706         case FF_CMP_SATD:
2707             cmp[i]= c->hadamard8_diff[i];
2708             break;
2709         case FF_CMP_SSE:
2710             cmp[i]= c->sse[i];
2711             break;
2712         case FF_CMP_DCT:
2713             cmp[i]= c->dct_sad[i];
2714             break;
2715         case FF_CMP_PSNR:
2716             cmp[i]= c->quant_psnr[i];
2717             break;
2718         case FF_CMP_BIT:
2719             cmp[i]= c->bit[i];
2720             break;
2721         case FF_CMP_RD:
2722             cmp[i]= c->rd[i];
2723             break;
2724         case FF_CMP_VSAD:
2725             cmp[i]= c->vsad[i];
2726             break;
2727         case FF_CMP_VSSE:
2728             cmp[i]= c->vsse[i];
2729             break;
2730         case FF_CMP_ZERO:
2731             cmp[i]= zero_cmp;
2732             break;
2733         case FF_CMP_NSSE:
2734             cmp[i]= c->nsse[i];
2735             break;
2736         default:
2737             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2738         }
2739     }
2740 }
2741
2742 /**
2743  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2744  */
2745 static void clear_blocks_c(DCTELEM *blocks)
2746 {
2747     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2748 }
2749
/**
 * Adds src to dst byte by byte; each sum wraps around modulo 256.
 *
 * @param dst accumulator, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] += src[n];
}
2765
/**
 * Stores the byte-wise difference src1 - src2 in dst; each difference wraps
 * around modulo 256.
 *
 * @param dst  output
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] = src1[n]-src2[n];
}
2781
/**
 * Writes the residual of a median prediction.
 * For every position the prediction is mid_pred() of the left value, the
 * src1 value and (left + src1 - top-left)&0xFF; dst receives src2 minus that
 * prediction. The left / top-left state is carried in and out through the
 * pointers.
 *
 * @param dst      residual output
 * @param src1     values used as the second predictor input (they become the
 *                 next top-left)
 * @param src2     values being predicted (they become the next left)
 * @param w        number of bytes to process
 * @param left     in: left value before position 0, out: last src2 value
 * @param left_top in: top-left value before position 0, out: last src1 value
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left    = *left;
    uint8_t cur_topleft = *left_top;
    int x;

    for(x=0; x<w; x++){
        const uint8_t top  = src1[x];
        const int gradient = (cur_left + top - cur_topleft)&0xFF;
        const int pred     = mid_pred(cur_left, top, gradient);

        cur_topleft= top;
        cur_left   = src2[x];
        dst[x]= cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_topleft;
}
2799
/*
 * Butterfly helpers for the Hadamard transforms below.
 * BUTTERFLY2 and BUTTERFLY1 are wrapped in do{}while(0) so that each use is
 * exactly one statement, also under an unbraced if/else; they must be
 * followed by a semicolon.
 */

/** o1 = i1 + i2, o2 = i1 - i2; the inputs are evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    o1= (i1)+(i2);\
    o2= (i1)-(i2);\
}while(0)

/** In-place butterfly: x becomes x+y and y becomes x-y (of the old values). */
#define BUTTERFLY1(x,y) \
do{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}while(0)

/** |x+y| + |x-y|, i.e. a final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2814
/**
 * Sum of absolute values of the 8x8 Hadamard transform of src - dst.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block
 * @param stride distance between two rows, same for both blocks
 * @param h      must be 8
 * @return sum of the absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every row of the difference */
    for(n=0; n<8; n++){
        int           * const row  = coef + 8*n;
        const uint8_t * const cur  = src  + stride*n;
        const uint8_t * const pred = dst  + stride*n;

        BUTTERFLY2(row[0], row[1], cur[0]-pred[0], cur[1]-pred[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-pred[2], cur[3]-pred[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-pred[4], cur[5]-pred[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-pred[6], cur[7]-pred[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(n=0; n<8; n++){
        int * const col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
2866
/**
 * Sum of absolute values of the 8x8 Hadamard transform of src itself, with
 * the |coef[0] + coef[32]| term (marked "-mean" in the original) subtracted
 * at the end.
 *
 * @param s      unused
 * @param src    block
 * @param dummy  unused
 * @param stride distance between two rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every row */
    for(n=0; n<8; n++){
        int           * const row = coef + 8*n;
        const uint8_t * const cur = src  + stride*n;

        BUTTERFLY2(row[0], row[1], cur[0], cur[1]);
        BUTTERFLY2(row[2], row[3], cur[2], cur[3]);
        BUTTERFLY2(row[4], row[5], cur[4], cur[5]);
        BUTTERFLY2(row[6], row[7], cur[6], cur[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(n=0; n<8; n++){
        int * const col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
2914
2915 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2916     MpegEncContext * const s= (MpegEncContext *)c;
2917     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2918     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2919     int sum=0, i;
2920
2921     assert(h==8);
2922
2923     s->dsp.diff_pixels(temp, src1, src2, stride);
2924     s->dsp.fdct(temp);
2925
2926     for(i=0; i<64; i++)
2927         sum+= ABS(temp[i]);
2928
2929     return sum;
2930 }
2931
2932 void simple_idct(DCTELEM *block); //FIXME
2933
2934 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2935     MpegEncContext * const s= (MpegEncContext *)c;
2936     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2937     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2938     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2939     int sum=0, i;
2940
2941     assert(h==8);
2942     s->mb_intra=0;
2943
2944     s->dsp.diff_pixels(temp, src1, src2, stride);
2945
2946     memcpy(bak, temp, 64*sizeof(DCTELEM));
2947
2948     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2949     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2950     simple_idct(temp); //FIXME
2951
2952     for(i=0; i<64; i++)
2953         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2954
2955     return sum;
2956 }
2957
2958 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2959     MpegEncContext * const s= (MpegEncContext *)c;
2960     const uint8_t *scantable= s->intra_scantable.permutated;
2961     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2962     uint64_t __align8 aligned_bak[stride];
2963     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2964     uint8_t * const bak= (uint8_t*)aligned_bak;
2965     int i, last, run, bits, level, distoration, start_i;
2966     const int esc_length= s->ac_esc_length;
2967     uint8_t * length;
2968     uint8_t * last_length;
2969
2970     assert(h==8);
2971
2972     for(i=0; i<8; i++){
2973         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2974         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2975     }
2976
2977     s->dsp.diff_pixels(temp, src1, src2, stride);
2978
2979     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2980
2981     bits=0;
2982
2983     if (s->mb_intra) {
2984         start_i = 1;
2985         length     = s->intra_ac_vlc_length;
2986         last_length= s->intra_ac_vlc_last_length;
2987         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2988     } else {
2989         start_i = 0;
2990         length     = s->inter_ac_vlc_length;
2991         last_length= s->inter_ac_vlc_last_length;
2992     }
2993
2994     if(last>=start_i){
2995         run=0;
2996         for(i=start_i; i<last; i++){
2997             int j= scantable[i];
2998             level= temp[j];
2999
3000             if(level){
3001                 level+=64;
3002                 if((level&(~127)) == 0){
3003                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3004                 }else
3005                     bits+= esc_length;
3006                 run=0;
3007             }else
3008                 run++;
3009         }
3010         i= scantable[last];
3011
3012         level= temp[i] + 64;
3013
3014         assert(level - 64);
3015
3016         if((level&(~127)) == 0){
3017             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3018         }else
3019             bits+= esc_length;
3020
3021     }
3022
3023     if(last>=0){
3024         if(s->mb_intra)
3025             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3026         else
3027             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3028     }
3029
3030     s->dsp.idct_add(bak, stride, temp);
3031
3032     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3033
3034     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3035 }
3036
3037 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3038     MpegEncContext * const s= (MpegEncContext *)c;
3039     const uint8_t *scantable= s->intra_scantable.permutated;
3040     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3041     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3042     int i, last, run, bits, level, start_i;
3043     const int esc_length= s->ac_esc_length;
3044     uint8_t * length;
3045     uint8_t * last_length;
3046
3047     assert(h==8);
3048
3049     s->dsp.diff_pixels(temp, src1, src2, stride);
3050
3051     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3052
3053     bits=0;
3054
3055     if (s->mb_intra) {
3056         start_i = 1;
3057         length     = s->intra_ac_vlc_length;
3058         last_length= s->intra_ac_vlc_last_length;
3059         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3060     } else {
3061         start_i = 0;
3062         length     = s->inter_ac_vlc_length;
3063         last_length= s->inter_ac_vlc_last_length;
3064     }
3065
3066     if(last>=start_i){
3067         run=0;
3068         for(i=start_i; i<last; i++){
3069             int j= scantable[i];
3070             level= temp[j];
3071
3072             if(level){
3073                 level+=64;
3074                 if((level&(~127)) == 0){
3075                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3076                 }else
3077                     bits+= esc_length;
3078                 run=0;
3079             }else
3080                 run++;
3081         }
3082         i= scantable[last];
3083
3084         level= temp[i] + 64;
3085
3086         assert(level - 64);
3087
3088         if((level&(~127)) == 0){
3089             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3090         }else
3091             bits+= esc_length;
3092     }
3093
3094     return bits;
3095 }
3096
/**
 * Vertical sum of absolute differences inside a single 16-pixel-wide block.
 *
 * For each of the first h-1 rows, the absolute difference to the row
 * directly below is accumulated over 16 columns.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride offset from one row to the next
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper = s;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        const uint8_t *lower = upper + stride;

        for (col = 0; col < 16; col++) {
            const int d = upper[col] - lower[col];
            total += d >= 0 ? d : -d;
        }
        upper = lower;
    }

    return total;
}
3111
/**
 * Vertical sum of absolute differences of the residual s1 - s2.
 *
 * For each of the first h-1 rows and each of 16 columns, the residual of the
 * row is compared with the residual of the row below and the absolute value
 * of that change is accumulated.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride offset from one row to the next, shared by both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute difference
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *a = s1;
    const uint8_t *b = s2;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int here  = a[col] - b[col];
            const int below = a[col + stride] - b[col + stride];
            const int d     = here - below;

            total += d >= 0 ? d : -d;
        }
        a += stride;
        b += stride;
    }

    return total;
}
3126
/** Square of an expression; note that the argument is expanded twice. */
#define SQ(a) ((a)*(a))

/**
 * Vertical sum of squared differences inside a single 16-pixel-wide block.
 *
 * For each of the first h-1 rows, the squared difference to the row directly
 * below is accumulated over 16 columns.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride offset from one row to the next
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper = s;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        const uint8_t *lower = upper + stride;

        for (col = 0; col < 16; col++)
            total += SQ(upper[col] - lower[col]);

        upper = lower;
    }

    return total;
}
3142
/**
 * Vertical sum of squared differences of the residual s1 - s2.
 *
 * For each of the first h-1 rows and each of 16 columns, the residual of the
 * row is compared with the residual of the row below and the square of that
 * change is accumulated.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride offset from one row to the next, shared by both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *a = s1;
    const uint8_t *b = s2;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int here  = a[col] - b[col];
            const int below = a[col + stride] - b[col + stride];
            const int d     = here - below;

            total += d * d;
        }
        a += stride;
        b += stride;
    }

    return total;
}
3157
/* Instantiate, for every 8x8 comparison function (first argument), the
 * companion function named by the second argument. dsputil_init() installs
 * these companions in slot [0] of the matching table (hadamard8_intra16_c in
 * hadamard8_diff[4]).
 * NOTE(review): WARPER8_16_SQ is defined outside this section; presumably it
 * builds the 16-wide variant out of calls to the 8x8 one - confirm against
 * the macro definition. */
3158 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3159 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3160 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3161 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3162 WARPER8_16_SQ(rd8x8_c, rd16_c)
3163 WARPER8_16_SQ(bit8x8_c, bit16_c)
3164
3165 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3166  converted */
3167 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3168 {
3169     j_rev_dct (block);
3170     put_pixels_clamped_c(block, dest, line_size);
3171 }
3172 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3173 {
3174     j_rev_dct (block);
3175     add_pixels_clamped_c(block, dest, line_size);
3176 }
3177
3178 /* init static data */
3179 void dsputil_static_init(void)
3180 {
3181     int i;
3182
3183     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3184     for(i=0;i<MAX_NEG_CROP;i++) {
3185         cropTbl[i] = 0;
3186         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3187     }
3188
3189     for(i=0;i<512;i++) {
3190         squareTbl[i] = (i - 256) * (i - 256);
3191     }
3192
3193     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3194 }
3195
3196
3197 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3198 {
3199     int i;
3200
3201 #ifdef CONFIG_ENCODERS
3202     if(avctx->dct_algo==FF_DCT_FASTINT) {
3203         c->fdct = fdct_ifast;
3204         c->fdct248 = fdct_ifast248;
3205     }
3206     else if(avctx->dct_algo==FF_DCT_FAAN) {
3207         c->fdct = ff_faandct;
3208         c->fdct248 = ff_faandct248;
3209     }
3210     else {
3211         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3212         c->fdct248 = ff_fdct248_islow;
3213     }
3214 #endif //CONFIG_ENCODERS
3215
3216     if(avctx->idct_algo==FF_IDCT_INT){
3217         c->idct_put= ff_jref_idct_put;
3218         c->idct_add= ff_jref_idct_add;
3219         c->idct    = j_rev_dct;
3220         c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3221     }else{ //accurate/default
3222         c->idct_put= simple_idct_put;
3223         c->idct_add= simple_idct_add;
3224         c->idct    = simple_idct;
3225         c->idct_permutation_type= FF_NO_IDCT_PERM;
3226     }
3227
3228     /* VP3 DSP support */
3229     c->vp3_dsp_init = vp3_dsp_init_c;
3230     c->vp3_idct = vp3_idct_c;
3231
3232     c->get_pixels = get_pixels_c;
3233     c->diff_pixels = diff_pixels_c;
3234     c->put_pixels_clamped = put_pixels_clamped_c;
3235     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3236     c->add_pixels_clamped = add_pixels_clamped_c;
3237     c->gmc1 = gmc1_c;
3238     c->gmc = gmc_c;
3239     c->clear_blocks = clear_blocks_c;
3240     c->pix_sum = pix_sum_c;
3241     c->pix_norm1 = pix_norm1_c;
3242
3243     /* TODO [0] 16  [1] 8 */
3244     c->pix_abs[0][0] = pix_abs16_c;
3245     c->pix_abs[0][1] = pix_abs16_x2_c;
3246     c->pix_abs[0][2] = pix_abs16_y2_c;
3247     c->pix_abs[0][3] = pix_abs16_xy2_c;
3248     c->pix_abs[1][0] = pix_abs8_c;
3249     c->pix_abs[1][1] = pix_abs8_x2_c;
3250     c->pix_abs[1][2] = pix_abs8_y2_c;
3251     c->pix_abs[1][3] = pix_abs8_xy2_c;
3252
3253 #define dspfunc(PFX, IDX, NUM) \
3254     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3255     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3256     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3257     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3258
3259     dspfunc(put, 0, 16);
3260     dspfunc(put_no_rnd, 0, 16);
3261     dspfunc(put, 1, 8);
3262     dspfunc(put_no_rnd, 1, 8);
3263     dspfunc(put, 2, 4);
3264     dspfunc(put, 3, 2);
3265
3266     dspfunc(avg, 0, 16);
3267     dspfunc(avg_no_rnd, 0, 16);
3268     dspfunc(avg, 1, 8);
3269     dspfunc(avg_no_rnd, 1, 8);
3270     dspfunc(avg, 2, 4);
3271     dspfunc(avg, 3, 2);
3272 #undef dspfunc
3273
3274     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3275     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3276
3277     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3278     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3279     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3280     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3281     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3282     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3283     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3284     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3285     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3286
3287     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3288     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3289     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3290     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3291     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3292     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3293     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3294     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3295     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3296
3297 #define dspfunc(PFX, IDX, NUM) \
3298     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3299     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3300     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3301     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3302     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3303     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3304     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3305     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3306     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3307     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3308     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3309     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3310     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3311     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3312     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3313     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3314
3315     dspfunc(put_qpel, 0, 16);
3316     dspfunc(put_no_rnd_qpel, 0, 16);
3317
3318     dspfunc(avg_qpel, 0, 16);
3319     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3320
3321     dspfunc(put_qpel, 1, 8);
3322     dspfunc(put_no_rnd_qpel, 1, 8);
3323
3324     dspfunc(avg_qpel, 1, 8);
3325     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3326
3327     dspfunc(put_h264_qpel, 0, 16);
3328     dspfunc(put_h264_qpel, 1, 8);
3329     dspfunc(put_h264_qpel, 2, 4);
3330     dspfunc(avg_h264_qpel, 0, 16);
3331     dspfunc(avg_h264_qpel, 1, 8);
3332     dspfunc(avg_h264_qpel, 2, 4);
3333
3334 #undef dspfunc
3335     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3336     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3337     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3338     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3339     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3340     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3341
3342     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3343     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3344     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3345     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3346     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3347     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3348     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3349     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3350
3351 #define SET_CMP_FUNC(name) \
3352     c->name[0]= name ## 16_c;\
3353     c->name[1]= name ## 8x8_c;
3354
3355     SET_CMP_FUNC(hadamard8_diff)
3356     c->hadamard8_diff[4]= hadamard8_intra16_c;
3357     SET_CMP_FUNC(dct_sad)
3358     c->sad[0]= pix_abs16_c;
3359     c->sad[1]= pix_abs8_c;
3360     c->sse[0]= sse16_c;
3361     c->sse[1]= sse8_c;
3362     SET_CMP_FUNC(quant_psnr)
3363     SET_CMP_FUNC(rd)
3364     SET_CMP_FUNC(bit)
3365     c->vsad[0]= vsad16_c;
3366     c->vsad[4]= vsad_intra16_c;
3367     c->vsse[0]= vsse16_c;
3368     c->vsse[4]= vsse_intra16_c;
3369     c->nsse[0]= nsse16_c;
3370     c->nsse[1]= nsse8_c;
3371
3372     c->add_bytes= add_bytes_c;
3373     c->diff_bytes= diff_bytes_c;
3374     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3375     c->bswap_buf= bswap_buf;
3376
3377     c->h263_h_loop_filter= h263_h_loop_filter_c;
3378     c->h263_v_loop_filter= h263_v_loop_filter_c;
3379
3380     c->h261_loop_filter= h261_loop_filter_c;
3381
3382     c->try_8x8basis= try_8x8basis_c;
3383     c->add_8x8basis= add_8x8basis_c;
3384
3385 #ifdef HAVE_MMX
3386     dsputil_init_mmx(c, avctx);
3387 #endif
3388 #ifdef ARCH_ARMV4L
3389     dsputil_init_armv4l(c, avctx);
3390 #endif
3391 #ifdef HAVE_MLIB
3392     dsputil_init_mlib(c, avctx);
3393 #endif
3394 #ifdef ARCH_SPARC
3395    dsputil_init_vis(c,avctx);
3396 #endif
3397 #ifdef ARCH_ALPHA
3398     dsputil_init_alpha(c, avctx);
3399 #endif
3400 #ifdef ARCH_POWERPC
3401     dsputil_init_ppc(c, avctx);
3402 #endif
3403 #ifdef HAVE_MMI
3404     dsputil_init_mmi(c, avctx);
3405 #endif
3406 #ifdef ARCH_SH4
3407     dsputil_init_sh4(c,avctx);
3408 #endif
3409
3410     switch(c->idct_permutation_type){
3411     case FF_NO_IDCT_PERM:
3412         for(i=0; i<64; i++)
3413             c->idct_permutation[i]= i;
3414         break;
3415     case FF_LIBMPEG2_IDCT_PERM:
3416         for(i=0; i<64; i++)
3417             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3418         break;
3419     case FF_SIMPLE_IDCT_PERM:
3420         for(i=0; i<64; i++)
3421             c->idct_permutation[i]= simple_mmx_permutation[i];
3422         break;
3423     case FF_TRANSPOSE_IDCT_PERM:
3424         for(i=0; i<64; i++)
3425             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3426         break;
3427     default:
3428         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3429     }
3430 }
3431