git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21
  22 /**
  23  * @file dsputil.c
  24  * DSP utils
  25  */
  26
  27 #include "avcodec.h"
  28 #include "dsputil.h"
  29 #include "mpegvideo.h"
  30 #include "simple_idct.h"
  31 #include "faandct.h"
  32
/* Lookup table used by the *_pixels_clamped_c routines in this file, which
 * index it as (cropTbl + MAX_NEG_CROP)[value]; the MAX_NEG_CROP entries on
 * each side of the central 256 leave room for out-of-range indices.
 * NOTE(review): the table is not filled in the code visible here - presumably
 * it saturates to 0..255; confirm against the init code. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Lookup table used by pix_norm1_c / sse8_c / sse16_c as
 * (squareTbl + 256)[d], with d ranging over pixel values or pixel
 * differences (-255..255).
 * NOTE(review): presumably entry (256 + d) holds d*d; filled elsewhere. */
uint32_t squareTbl[512];
  35
/* Zigzag scan order for an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient visited by the scan. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  46
/* Specific zigzag scan for the 248 IDCT. NOTE that, unlike the
   specification, we interleave the fields.
   As in ff_zigzag_direct, each of the 64 entries is a position (0..63)
   inside the 8x8 block, listed in scan order. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  59
/* Non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer.
 * Declared 8-byte aligned via __align8; the contents are not initialised
 * here, so they must be filled in at run time before use. */
uint16_t __align8 inv_zigzag_direct16[64];
  62
/* Alternate "horizontal" scan order for an 8x8 block: 64 block positions
 * (0..63) listed in the order they are scanned. Compared with the plain
 * zigzag, the scan starts by walking along the first row (0, 1, 2, 3). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  73
/* Alternate "vertical" scan order for an 8x8 block: 64 block positions
 * (0..63) listed in the order they are scanned. Compared with the plain
 * zigzag, the scan starts by walking down the first column (0, 8, 16, 24). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  84
/* Fixed-point reciprocal table for division by multiplication:
 *     a*inverse[b]>>32 == a/b   for all 0<=a<=65536 && 2<=b<=255
 * The product does not fit in 32 bits, so it has to be formed in a 64-bit
 * type before shifting. Entries for b >= 2 are 2^32/b rounded up;
 * inverse[1] is 2^32-1 (2^32 itself is not representable in uint32_t) and
 * inverse[0] is 0, so the identity above is not claimed for b < 2. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 120
/* Input permutation for the simple_idct_mmx: 64 block positions
 * (0x00..0x3F), one per coefficient.
 * NOTE(review): the table is not referenced in the code visible here, so the
 * direction of the mapping (source -> destination or the reverse) should be
 * checked where it is consumed. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 132
 133 static int pix_sum_c(uint8_t * pix, int line_size)
 134 {
 135     int s, i, j;
 136
 137     s = 0;
 138     for (i = 0; i < 16; i++) {
 139         for (j = 0; j < 16; j += 8) {
 140             s += pix[0];
 141             s += pix[1];
 142             s += pix[2];
 143             s += pix[3];
 144             s += pix[4];
 145             s += pix[5];
 146             s += pix[6];
 147             s += pix[7];
 148             pix += 8;
 149         }
 150         pix += line_size - 16;
 151     }
 152     return s;
 153 }
 154
 155 static int pix_norm1_c(uint8_t * pix, int line_size)
 156 {
 157     int s, i, j;
 158     uint32_t *sq = squareTbl + 256;
 159
 160     s = 0;
 161     for (i = 0; i < 16; i++) {
 162         for (j = 0; j < 16; j += 8) {
 163 #if 0
 164             s += sq[pix[0]];
 165             s += sq[pix[1]];
 166             s += sq[pix[2]];
 167             s += sq[pix[3]];
 168             s += sq[pix[4]];
 169             s += sq[pix[5]];
 170             s += sq[pix[6]];
 171             s += sq[pix[7]];
 172 #else
 173 #if LONG_MAX > 2147483647
 174             register uint64_t x=*(uint64_t*)pix;
 175             s += sq[x&0xff];
 176             s += sq[(x>>8)&0xff];
 177             s += sq[(x>>16)&0xff];
 178             s += sq[(x>>24)&0xff];
 179             s += sq[(x>>32)&0xff];
 180             s += sq[(x>>40)&0xff];
 181             s += sq[(x>>48)&0xff];
 182             s += sq[(x>>56)&0xff];
 183 #else
 184             register uint32_t x=*(uint32_t*)pix;
 185             s += sq[x&0xff];
 186             s += sq[(x>>8)&0xff];
 187             s += sq[(x>>16)&0xff];
 188             s += sq[(x>>24)&0xff];
 189             x=*(uint32_t*)(pix+4);
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194 #endif
 195 #endif
 196             pix += 8;
 197         }
 198         pix += line_size - 16;
 199     }
 200     return s;
 201 }
 202
 203 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 204     int i;
 205
 206     for(i=0; i+8<=w; i+=8){
 207         dst[i+0]= bswap_32(src[i+0]);
 208         dst[i+1]= bswap_32(src[i+1]);
 209         dst[i+2]= bswap_32(src[i+2]);
 210         dst[i+3]= bswap_32(src[i+3]);
 211         dst[i+4]= bswap_32(src[i+4]);
 212         dst[i+5]= bswap_32(src[i+5]);
 213         dst[i+6]= bswap_32(src[i+6]);
 214         dst[i+7]= bswap_32(src[i+7]);
 215     }
 216     for(;i<w; i++){
 217         dst[i+0]= bswap_32(src[i+0]);
 218     }
 219 }
 220
 221 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 222 {
 223     int s, i;
 224     uint32_t *sq = squareTbl + 256;
 225
 226     s = 0;
 227     for (i = 0; i < h; i++) {
 228         s += sq[pix1[0] - pix2[0]];
 229         s += sq[pix1[1] - pix2[1]];
 230         s += sq[pix1[2] - pix2[2]];
 231         s += sq[pix1[3] - pix2[3]];
 232         s += sq[pix1[4] - pix2[4]];
 233         s += sq[pix1[5] - pix2[5]];
 234         s += sq[pix1[6] - pix2[6]];
 235         s += sq[pix1[7] - pix2[7]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[ 0] - pix2[ 0]];
 250         s += sq[pix1[ 1] - pix2[ 1]];
 251         s += sq[pix1[ 2] - pix2[ 2]];
 252         s += sq[pix1[ 3] - pix2[ 3]];
 253         s += sq[pix1[ 4] - pix2[ 4]];
 254         s += sq[pix1[ 5] - pix2[ 5]];
 255         s += sq[pix1[ 6] - pix2[ 6]];
 256         s += sq[pix1[ 7] - pix2[ 7]];
 257         s += sq[pix1[ 8] - pix2[ 8]];
 258         s += sq[pix1[ 9] - pix2[ 9]];
 259         s += sq[pix1[10] - pix2[10]];
 260         s += sq[pix1[11] - pix2[11]];
 261         s += sq[pix1[12] - pix2[12]];
 262         s += sq[pix1[13] - pix2[13]];
 263         s += sq[pix1[14] - pix2[14]];
 264         s += sq[pix1[15] - pix2[15]];
 265
 266         pix1 += line_size;
 267         pix2 += line_size;
 268     }
 269     return s;
 270 }
 271
 272 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 273 {
 274     int i;
 275
 276     /* read the pixels */
 277     for(i=0;i<8;i++) {
 278         block[0] = pixels[0];
 279         block[1] = pixels[1];
 280         block[2] = pixels[2];
 281         block[3] = pixels[3];
 282         block[4] = pixels[4];
 283         block[5] = pixels[5];
 284         block[6] = pixels[6];
 285         block[7] = pixels[7];
 286         pixels += line_size;
 287         block += 8;
 288     }
 289 }
 290
 291 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 292                           const uint8_t *s2, int stride){
 293     int i;
 294
 295     /* read the pixels */
 296     for(i=0;i<8;i++) {
 297         block[0] = s1[0] - s2[0];
 298         block[1] = s1[1] - s2[1];
 299         block[2] = s1[2] - s2[2];
 300         block[3] = s1[3] - s2[3];
 301         block[4] = s1[4] - s2[4];
 302         block[5] = s1[5] - s2[5];
 303         block[6] = s1[6] - s2[6];
 304         block[7] = s1[7] - s2[7];
 305         s1 += stride;
 306         s2 += stride;
 307         block += 8;
 308     }
 309 }
 310
 311
 312 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 313                                  int line_size)
 314 {
 315     int i;
 316     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 317
 318     /* read the pixels */
 319     for(i=0;i<8;i++) {
 320         pixels[0] = cm[block[0]];
 321         pixels[1] = cm[block[1]];
 322         pixels[2] = cm[block[2]];
 323         pixels[3] = cm[block[3]];
 324         pixels[4] = cm[block[4]];
 325         pixels[5] = cm[block[5]];
 326         pixels[6] = cm[block[6]];
 327         pixels[7] = cm[block[7]];
 328
 329         pixels += line_size;
 330         block += 8;
 331     }
 332 }
 333
 334 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 335                           int line_size)
 336 {
 337     int i;
 338     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 339
 340     /* read the pixels */
 341     for(i=0;i<8;i++) {
 342         pixels[0] = cm[pixels[0] + block[0]];
 343         pixels[1] = cm[pixels[1] + block[1]];
 344         pixels[2] = cm[pixels[2] + block[2]];
 345         pixels[3] = cm[pixels[3] + block[3]];
 346         pixels[4] = cm[pixels[4] + block[4]];
 347         pixels[5] = cm[pixels[5] + block[5]];
 348         pixels[6] = cm[pixels[6] + block[6]];
 349         pixels[7] = cm[pixels[7] + block[7]];
 350         pixels += line_size;
 351         block += 8;
 352     }
 353 }
 354 #if 0
 355
/*
 * PIXOP2(OPNAME, OP), 64-bit flavour. This definition sits inside the
 * enclosing "#if 0", so it is currently not compiled.
 *
 * It generates copy / half-pel interpolation routines that handle 8 samples
 * per row with a single uint64_t: sources are read with LD64() and results
 * are handed to OP(destination lvalue, value); what OP does with the value
 * is decided by the instantiation.
 *
 * Bit tricks used (all operate on the eight byte lanes in parallel):
 *   (a&b) + (((a^b)&0xFE..FE)>>1)  average of a and b, rounded down (no_rnd)
 *   (a|b) - (((a^b)&0xFE..FE)>>1)  average of a and b, rounded up
 *   The 0xFE mask drops the lowest bit of each byte before the shift so that
 *   no bit crosses into the neighbouring byte.
 *   In the xy2 variants every byte is split into its low 2 bits (l0/l1) and
 *   its high 6 bits shifted down by 2 (h0/h1), so the sum of four samples
 *   fits in a byte; the per-byte constant added to l0 is the rounding term
 *   (0x02 for the rounding version, 0x01 for no_rnd). The xy2 loops produce
 *   two output rows per iteration, reusing the horizontal sums of the row
 *   shared between them.
 *
 * NOTE(review): the first routine is generated as OPNAME##_pixels, whereas
 * the CALL_2X_PIXELS line at the end refers to OPNAME##_pixels_c; the names
 * would have to be reconciled before this variant could be enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* OP for the averaging instantiation of the 64-bit variant: replaces a with
 * the byte-wise average of a and b, rounded up (same identity as above).
 * Both arguments are evaluated more than once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 496 #else // 64 bit variant
 497
 498 #define PIXOP2(OPNAME, OP) \
 499 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 500     int i;\
 501     for(i=0; i<h; i++){\
 502         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 503         pixels+=line_size;\
 504         block +=line_size;\
 505     }\
 506 }\
 507 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 508     int i;\
 509     for(i=0; i<h; i++){\
 510         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 511         pixels+=line_size;\
 512         block +=line_size;\
 513     }\
 514 }\
 515 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 516     int i;\
 517     for(i=0; i<h; i++){\
 518         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 519         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 520         pixels+=line_size;\
 521         block +=line_size;\
 522     }\
 523 }\
 524 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 525     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 526 }\
 527 \
 528 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 529                                                 int src_stride1, int src_stride2, int h){\
 530     int i;\
 531     for(i=0; i<h; i++){\
 532         uint32_t a,b;\
 533         a= LD32(&src1[i*src_stride1  ]);\
 534         b= LD32(&src2[i*src_stride2  ]);\
 535         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 536         a= LD32(&src1[i*src_stride1+4]);\
 537         b= LD32(&src2[i*src_stride2+4]);\
 538         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 539     }\
 540 }\
 541 \
 542 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 543                                                 int src_stride1, int src_stride2, int h){\
 544     int i;\
 545     for(i=0; i<h; i++){\
 546         uint32_t a,b;\
 547         a= LD32(&src1[i*src_stride1  ]);\
 548         b= LD32(&src2[i*src_stride2  ]);\
 549         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 550         a= LD32(&src1[i*src_stride1+4]);\
 551         b= LD32(&src2[i*src_stride2+4]);\
 552         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 553     }\
 554 }\
 555 \
 556 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 557                                                 int src_stride1, int src_stride2, int h){\
 558     int i;\
 559     for(i=0; i<h; i++){\
 560         uint32_t a,b;\
 561         a= LD32(&src1[i*src_stride1  ]);\
 562         b= LD32(&src2[i*src_stride2  ]);\
 563         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 564     }\
 565 }\
 566 \
 567 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 568                                                 int src_stride1, int src_stride2, int h){\
 569     int i;\
 570     for(i=0; i<h; i++){\
 571         uint32_t a,b;\
 572         a= LD16(&src1[i*src_stride1  ]);\
 573         b= LD16(&src2[i*src_stride2  ]);\
 574         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 575     }\
 576 }\
 577 \
 578 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 579                                                 int src_stride1, int src_stride2, int h){\
 580     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 581     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 582 }\
 583 \
 584 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 585                                                 int src_stride1, int src_stride2, int h){\
 586     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 587     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 588 }\
 589 \
 590 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 591     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 592 }\
 593 \
 594 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 595     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 596 }\
 597 \
 598 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 599     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 600 }\
 601 \
 602 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 603     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 604 }\
 605 \
 606 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 607                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 608     int i;\
 609     for(i=0; i<h; i++){\
 610         uint32_t a, b, c, d, l0, l1, h0, h1;\
 611         a= LD32(&src1[i*src_stride1]);\
 612         b= LD32(&src2[i*src_stride2]);\
 613         c= LD32(&src3[i*src_stride3]);\
 614         d= LD32(&src4[i*src_stride4]);\
 615         l0=  (a&0x03030303UL)\
 616            + (b&0x03030303UL)\
 617            + 0x02020202UL;\
 618         h0= ((a&0xFCFCFCFCUL)>>2)\
 619           + ((b&0xFCFCFCFCUL)>>2);\
 620         l1=  (c&0x03030303UL)\
 621            + (d&0x03030303UL);\
 622         h1= ((c&0xFCFCFCFCUL)>>2)\
 623           + ((d&0xFCFCFCFCUL)>>2);\
 624         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 625         a= LD32(&src1[i*src_stride1+4]);\
 626         b= LD32(&src2[i*src_stride2+4]);\
 627         c= LD32(&src3[i*src_stride3+4]);\
 628         d= LD32(&src4[i*src_stride4+4]);\
 629         l0=  (a&0x03030303UL)\
 630            + (b&0x03030303UL)\
 631            + 0x02020202UL;\
 632         h0= ((a&0xFCFCFCFCUL)>>2)\
 633           + ((b&0xFCFCFCFCUL)>>2);\
 634         l1=  (c&0x03030303UL)\
 635            + (d&0x03030303UL);\
 636         h1= ((c&0xFCFCFCFCUL)>>2)\
 637           + ((d&0xFCFCFCFCUL)>>2);\
 638         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 639     }\
 640 }\
 641 \
 642 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 643     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 644 }\
 645 \
 646 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 647     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 648 }\
 649 \
 650 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 651     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 652 }\
 653 \
 654 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 655     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 656 }\
 657 \
 658 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 659                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 660     int i;\
 661     for(i=0; i<h; i++){\
 662         uint32_t a, b, c, d, l0, l1, h0, h1;\
 663         a= LD32(&src1[i*src_stride1]);\
 664         b= LD32(&src2[i*src_stride2]);\
 665         c= LD32(&src3[i*src_stride3]);\
 666         d= LD32(&src4[i*src_stride4]);\
 667         l0=  (a&0x03030303UL)\
 668            + (b&0x03030303UL)\
 669            + 0x01010101UL;\
 670         h0= ((a&0xFCFCFCFCUL)>>2)\
 671           + ((b&0xFCFCFCFCUL)>>2);\
 672         l1=  (c&0x03030303UL)\
 673            + (d&0x03030303UL);\
 674         h1= ((c&0xFCFCFCFCUL)>>2)\
 675           + ((d&0xFCFCFCFCUL)>>2);\
 676         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 677         a= LD32(&src1[i*src_stride1+4]);\
 678         b= LD32(&src2[i*src_stride2+4]);\
 679         c= LD32(&src3[i*src_stride3+4]);\
 680         d= LD32(&src4[i*src_stride4+4]);\
 681         l0=  (a&0x03030303UL)\
 682            + (b&0x03030303UL)\
 683            + 0x01010101UL;\
 684         h0= ((a&0xFCFCFCFCUL)>>2)\
 685           + ((b&0xFCFCFCFCUL)>>2);\
 686         l1=  (c&0x03030303UL)\
 687            + (d&0x03030303UL);\
 688         h1= ((c&0xFCFCFCFCUL)>>2)\
 689           + ((d&0xFCFCFCFCUL)>>2);\
 690         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 691     }\
 692 }\
 693 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 694                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 695     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 696     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 697 }\
 698 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 699                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 700     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 701     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 702 }\
 703 \
 704 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 705 {\
 706         int i, a0, b0, a1, b1;\
 707         a0= pixels[0];\
 708         b0= pixels[1] + 2;\
 709         a0 += b0;\
 710         b0 += pixels[2];\
 711 \
 712         pixels+=line_size;\
 713         for(i=0; i<h; i+=2){\
 714             a1= pixels[0];\
 715             b1= pixels[1];\
 716             a1 += b1;\
 717             b1 += pixels[2];\
 718 \
 719             block[0]= (a1+a0)>>2; /* FIXME non put */\
 720             block[1]= (b1+b0)>>2;\
 721 \
 722             pixels+=line_size;\
 723             block +=line_size;\
 724 \
 725             a0= pixels[0];\
 726             b0= pixels[1] + 2;\
 727             a0 += b0;\
 728             b0 += pixels[2];\
 729 \
 730             block[0]= (a1+a0)>>2;\
 731             block[1]= (b1+b0)>>2;\
 732             pixels+=line_size;\
 733             block +=line_size;\
 734         }\
 735 }\
 736 \
     /* Half-pel (x+1/2, y+1/2) interpolation of a 4-pixel-wide block, four pixels at a time in one 32-bit word. */\
     /* Each byte is split into its low 2 bits (l*) and its high 6 bits pre-shifted by 2 (h*), so the per-byte sums of four source pixels cannot carry into the neighbouring byte. */\
     /* LD32 is defined outside this view; presumably a 32-bit load from a possibly unaligned address - confirm. */\
 737 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 738 {\
 739         int i;\
 740         const uint32_t a= LD32(pixels  );\
 741         const uint32_t b= LD32(pixels+1);\
         /* l0/h0 hold the horizontal pair sums of the even source row; the +2 rounding term is carried in l0 only. */\
 742         uint32_t l0=  (a&0x03030303UL)\
 743                     + (b&0x03030303UL)\
 744                     + 0x02020202UL;\
 745         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 746                    + ((b&0xFCFCFCFCUL)>>2);\
 747         uint32_t l1,h1;\
 748 \
 749         pixels+=line_size;\
         /* Two output rows per pass: l1/h1 (odd source row) and l0/h0 (even source row) alternate as "previous row". */\
 750         for(i=0; i<h; i+=2){\
             /* These a/b shadow the const a/b declared above. */\
 751             uint32_t a= LD32(pixels  );\
 752             uint32_t b= LD32(pixels+1);\
 753             l1=  (a&0x03030303UL)\
 754                + (b&0x03030303UL);\
 755             h1= ((a&0xFCFCFCFCUL)>>2)\
 756               + ((b&0xFCFCFCFCUL)>>2);\
             /* Recombine: high parts add directly, low parts are divided by 4 and masked to 4 bits per byte. */\
 757             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 758             pixels+=line_size;\
 759             block +=line_size;\
 760             a= LD32(pixels  );\
 761             b= LD32(pixels+1);\
 762             l0=  (a&0x03030303UL)\
 763                + (b&0x03030303UL)\
 764                + 0x02020202UL;\
 765             h0= ((a&0xFCFCFCFCUL)>>2)\
 766               + ((b&0xFCFCFCFCUL)>>2);\
 767             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 768             pixels+=line_size;\
 769             block +=line_size;\
 770         }\
 771 }\
 772 \
     /* Half-pel (x+1/2, y+1/2) interpolation of an 8-pixel-wide block, done as two 4-pixel-wide columns (j loop). */\
     /* Uses the same packed-byte scheme as the 4-wide version: low 2 bits (l*) and high 6 bits >> 2 (h*) are summed separately, with +2 per byte for rounding. */\
 773 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 774 {\
 775     int j;\
 776     for(j=0; j<2; j++){\
 777         int i;\
 778         const uint32_t a= LD32(pixels  );\
 779         const uint32_t b= LD32(pixels+1);\
 780         uint32_t l0=  (a&0x03030303UL)\
 781                     + (b&0x03030303UL)\
 782                     + 0x02020202UL;\
 783         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 784                    + ((b&0xFCFCFCFCUL)>>2);\
 785         uint32_t l1,h1;\
 786 \
 787         pixels+=line_size;\
         /* Two output rows per pass; l0/h0 and l1/h1 alternate as the previous source row. */\
 788         for(i=0; i<h; i+=2){\
 789             uint32_t a= LD32(pixels  );\
 790             uint32_t b= LD32(pixels+1);\
 791             l1=  (a&0x03030303UL)\
 792                + (b&0x03030303UL);\
 793             h1= ((a&0xFCFCFCFCUL)>>2)\
 794               + ((b&0xFCFCFCFCUL)>>2);\
 795             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 796             pixels+=line_size;\
 797             block +=line_size;\
 798             a= LD32(pixels  );\
 799             b= LD32(pixels+1);\
 800             l0=  (a&0x03030303UL)\
 801                + (b&0x03030303UL)\
 802                + 0x02020202UL;\
 803             h0= ((a&0xFCFCFCFCUL)>>2)\
 804               + ((b&0xFCFCFCFCUL)>>2);\
 805             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 806             pixels+=line_size;\
 807             block +=line_size;\
 808         }\
         /* Rewind to the top row and step 4 pixels right; exact only when the loop advanced h rows, i.e. h is even. */\
 809         pixels+=4-line_size*(h+1);\
 810         block +=4-line_size*h;\
 811     }\
 812 }\
 813 \
     /* Same 8-wide half-pel (x+1/2, y+1/2) interpolation as _pixels8_xy2_c, but with a per-byte bias of 1 instead of 2, */\
     /* i.e. (sum of four pixels + 1) >> 2 rather than (sum + 2) >> 2. */\
 814 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 815 {\
 816     int j;\
 817     for(j=0; j<2; j++){\
 818         int i;\
 819         const uint32_t a= LD32(pixels  );\
 820         const uint32_t b= LD32(pixels+1);\
         /* The 0x01010101 term is the only difference from the rounding variant (0x02020202 there). */\
 821         uint32_t l0=  (a&0x03030303UL)\
 822                     + (b&0x03030303UL)\
 823                     + 0x01010101UL;\
 824         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 825                    + ((b&0xFCFCFCFCUL)>>2);\
 826         uint32_t l1,h1;\
 827 \
 828         pixels+=line_size;\
 829         for(i=0; i<h; i+=2){\
 830             uint32_t a= LD32(pixels  );\
 831             uint32_t b= LD32(pixels+1);\
 832             l1=  (a&0x03030303UL)\
 833                + (b&0x03030303UL);\
 834             h1= ((a&0xFCFCFCFCUL)>>2)\
 835               + ((b&0xFCFCFCFCUL)>>2);\
 836             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 837             pixels+=line_size;\
 838             block +=line_size;\
 839             a= LD32(pixels  );\
 840             b= LD32(pixels+1);\
 841             l0=  (a&0x03030303UL)\
 842                + (b&0x03030303UL)\
 843                + 0x01010101UL;\
 844             h0= ((a&0xFCFCFCFCUL)>>2)\
 845               + ((b&0xFCFCFCFCUL)>>2);\
 846             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 847             pixels+=line_size;\
 848             block +=line_size;\
 849         }\
         /* Rewind to the top row and move on to the right-hand 4-pixel column (assumes an even h). */\
 850         pixels+=4-line_size*(h+1);\
 851         block +=4-line_size*h;\
 852     }\
 853 }\
 854 \
     /* 16-pixel-wide entry points generated from the 8-wide functions. CALL_2X_PIXELS is defined outside this view; */\
     /* presumably it calls the 8-wide function twice, the second time offset by the last argument (8) - confirm. */\
     /* Note: the full-pel no_rnd variant maps onto the ordinary _pixels8_c. */\
 855 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
 856 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
 857 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
 858 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
 859 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
 860 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
 861 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
 862 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
 863
     /* Store operators passed to PIXOP2 as OP. op_avg blends the new value with the destination through
      * rnd_avg32 (defined elsewhere); op_put simply overwrites. The #endif closes a conditional opened
      * above this point in the file. */
 864 #define op_avg(a, b) a = rnd_avg32(a, b)
 865 #endif
 866 #define op_put(a, b) a = b
 867
     /* Instantiate the avg_* and put_* families of pixel functions, then drop the temporary operators. */
 868 PIXOP2(avg, op_avg)
 869 PIXOP2(put, op_put)
 870 #undef op_avg
 871 #undef op_put
 872
 873 #define avg2(a,b) ((a+b+1)>>1)
 874 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 875
 876
 877 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 878 {
 879     const int A=(16-x16)*(16-y16);
 880     const int B=(   x16)*(16-y16);
 881     const int C=(16-x16)*(   y16);
 882     const int D=(   x16)*(   y16);
 883     int i;
 884
 885     for(i=0; i<h; i++)
 886     {
 887         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 888         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 889         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 890         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 891         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 892         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 893         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 894         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 895         dst+= stride;
 896         src+= stride;
 897     }
 898 }
 899
     /**
      * Global motion compensation of an 8-pixel-wide block with a per-pixel
      * (affine) source position.
      *
      * ox/oy are the fixed-point source coordinates of the first pixel. The low
      * 16 bits are dropped, the next `shift` bits are the bilinear fraction and
      * the rest is the integer position. dxx/dyx are added per pixel along a
      * row, dxy/dyy per row. `r` is added before the final >>(2*shift).
      * width/height bound the valid source positions; `clip` is defined
      * elsewhere (presumably clamps to [min, max] - confirm).
      */
 900 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 901                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 902 {
 903     int y, vx, vy;
 904     const int s= 1<<shift;
 905
     /* From here on width/height are the largest coordinates that still have a right/lower neighbour. */
 906     width--;
 907     height--;
 908
 909     for(y=0; y<h; y++){
 910         int x;
 911
 912         vx= ox;
 913         vy= oy;
 914         for(x=0; x<8; x++){ //XXX FIXME optimize
 915             int src_x, src_y, frac_x, frac_y, index;
 916
 917             src_x= vx>>16;
 918             src_y= vy>>16;
 919             frac_x= src_x&(s-1);
 920             frac_y= src_y&(s-1);
 921             src_x>>=shift;
 922             src_y>>=shift;
 923
             /* The unsigned compare rejects negative coordinates and coordinates >= the bound in one test. */
 924             if((unsigned)src_x < width){
 925                 if((unsigned)src_y < height){
                     /* Fully inside: bilinear blend of the 2x2 neighbourhood. */
 926                     index= src_x + src_y*stride;
 927                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 928                                            + src[index       +1]*   frac_x )*(s-frac_y)
 929                                         + (  src[index+stride  ]*(s-frac_x)
 930                                            + src[index+stride+1]*   frac_x )*   frac_y
 931                                         + r)>>(shift*2);
 932                 }else{
                     /* Outside vertically: clamp the row and interpolate horizontally only. */
 933                     index= src_x + clip(src_y, 0, height)*stride;
 934                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 935                                           + src[index       +1]*   frac_x )*s
 936                                         + r)>>(shift*2);
 937                 }
 938             }else{
 939                 if((unsigned)src_y < height){
                     /* Outside horizontally: clamp the column and interpolate vertically only. */
 940                     index= clip(src_x, 0, width) + src_y*stride;
 941                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 942                                            + src[index+stride  ]*   frac_y )*s
 943                                         + r)>>(shift*2);
 944                 }else{
                     /* Outside in both directions: copy the clamped pixel unchanged. */
 945                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
 946                     dst[y*stride + x]=    src[index         ];
 947                 }
 948             }
 949
 950             vx+= dxx;
 951             vy+= dyx;
 952         }
 953         ox += dxy;
 954         oy += dyy;
 955     }
 956 }
 957
 958 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 959     switch(width){
 960     case 2: put_pixels2_c (dst, src, stride, height); break;
 961     case 4: put_pixels4_c (dst, src, stride, height); break;
 962     case 8: put_pixels8_c (dst, src, stride, height); break;
 963     case 16:put_pixels16_c(dst, src, stride, height); break;
 964     }
 965 }
 966
 967 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 968     int i,j;
 969     for (i=0; i < height; i++) {
 970       for (j=0; j < width; j++) {
 971         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 972       }
 973       src += stride;
 974       dst += stride;
 975     }
 976 }
 977
 978 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 979     int i,j;
 980     for (i=0; i < height; i++) {
 981       for (j=0; j < width; j++) {
 982         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 983       }
 984       src += stride;
 985       dst += stride;
 986     }
 987 }
 988
 989 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 990     int i,j;
 991     for (i=0; i < height; i++) {
 992       for (j=0; j < width; j++) {
 993         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 994       }
 995       src += stride;
 996       dst += stride;
 997     }
 998 }
 999
/**
 * Third-pel 2-D interpolation with weights 4,3 / 3,2 over the 2x2 source
 * neighbourhood (total 12). The division by 12 is done as *2731 >> 15;
 * +6 rounds.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1010
/**
 * Third-pel 2-D interpolation with weights 3,2 / 4,3 over the 2x2 source
 * neighbourhood (total 12). The division by 12 is done as *2731 >> 15;
 * +6 rounds.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1021
/**
 * Third-pel vertical interpolation weighting the pixel and the one below it
 * as 1 : 2. The division by 3 is done as *683 >> 11; +1 rounds.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1032
/**
 * Third-pel 2-D interpolation with weights 3,4 / 2,3 over the 2x2 source
 * neighbourhood (total 12). The division by 12 is done as *2731 >> 15;
 * +6 rounds.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1043
/**
 * Third-pel 2-D interpolation with weights 2,3 / 3,4 over the 2x2 source
 * neighbourhood (total 12). The division by 12 is done as *2731 >> 15;
 * +6 rounds.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1054
/**
 * Full-pel averaging: forwards to the avg_pixelsN_c function that matches
 * `width` (2, 4, 8 or 16). Any other width does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1063
/**
 * Third-pel horizontal interpolation (weights 2 : 1 for src[x] : src[x+1],
 * divided by 3 via *683 >> 11), then a rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1074
/**
 * Third-pel horizontal interpolation (weights 1 : 2 for src[x] : src[x+1],
 * divided by 3 via *683 >> 11), then a rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1085
/**
 * Third-pel vertical interpolation (weights 2 : 1 for the pixel and the one
 * below it, divided by 3 via *683 >> 11), then a rounded average with the
 * pixel already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1096
/**
 * Third-pel 2-D interpolation (weights 4,3 / 3,2 over the 2x2 neighbourhood,
 * divided by 12 via *2731 >> 15), then a rounded average with the pixel
 * already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1107
/**
 * Third-pel 2-D interpolation (weights 3,2 / 4,3 over the 2x2 neighbourhood,
 * divided by 12 via *2731 >> 15), then a rounded average with the pixel
 * already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1118
/**
 * Third-pel vertical interpolation (weights 1 : 2 for the pixel and the one
 * below it, divided by 3 via *683 >> 11), then a rounded average with the
 * pixel already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1129
/**
 * Third-pel 2-D interpolation (weights 3,4 / 2,3 over the 2x2 neighbourhood,
 * divided by 12 via *2731 >> 15), then a rounded average with the pixel
 * already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1140
/**
 * Third-pel 2-D interpolation (weights 2,3 / 3,4 over the 2x2 neighbourhood,
 * divided by 12 via *2731 >> 15), then a rounded average with the pixel
 * already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled: TPEL_WIDTH(width) would generate fixed-width put_tpel_pixels<width>_mcXY_c
 * wrappers around the generic put_tpel_pixels_mcXY_c functions. The stray `void` in
 * front of each forwarded call has been removed so the wrappers are valid C if this
 * block is ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1172
/*
 * Bilinear chroma motion compensation at 1/8-pel precision.
 *
 * H264_CHROMA_MC_W emits one function for a block W pixels wide; x and y
 * (each 0..7) are the fractional offsets. The four weights of the 2x2 source
 * neighbourhood add up to 64, and the weighted sum is handed to OP, which
 * does the rounding, scaling and store.
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<W; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/* The 2-, 4- and 8-pixel-wide variants for one store operator. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b is a weighted sum scaled by 64: round and scale it, then store it (put)
 * or take the rounded average with the destination (avg). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1243
/**
 * Copy h rows of 4 bytes from src to dst, one 32-bit LD32/ST32 transfer per
 * row. dstStride and srcStride are the row pitches of the two buffers.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;
    for (rows_left = h; rows_left > 0; rows_left--) {
        ST32(dst, LD32(src));
        src += srcStride;
        dst += dstStride;
    }
}
1254
/**
 * Copy h rows of 8 bytes from src to dst as two 32-bit LD32/ST32 transfers
 * per row. dstStride and srcStride are the row pitches of the two buffers.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left, off;
    for (rows_left = h; rows_left > 0; rows_left--) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        src += srcStride;
        dst += dstStride;
    }
}
1266
/**
 * Copy h rows of 16 bytes from src to dst as four 32-bit LD32/ST32 transfers
 * per row. dstStride and srcStride are the row pitches of the two buffers.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left, off;
    for (rows_left = h; rows_left > 0; rows_left--) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        src += srcStride;
        dst += dstStride;
    }
}
1280
/**
 * Copy h rows of 17 bytes from src to dst: four 32-bit LD32/ST32 transfers
 * followed by one trailing byte per row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left, off;
    for (rows_left = h; rows_left > 0; rows_left--) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
        src += srcStride;
        dst += dstStride;
    }
}
1295
/**
 * Copy h rows of 9 bytes from src to dst: two 32-bit LD32/ST32 transfers
 * followed by one trailing byte per row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left, off;
    for (rows_left = h; rows_left > 0; rows_left--) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
        src += srcStride;
        dst += dstStride;
    }
}
1308
1309
1310 #define QPEL_MC(r, OPNAME, RND, OP) \
     /* Horizontal 8-tap lowpass (taps -1, 3, -6, 20, 20, -6, 3, -1) producing 8 outputs per row for h rows. */\
     /* Only src[0..8] is read: taps that would fall outside that range are mirrored back inside, which is why the first and last three lines break the regular index pattern. */\
     /* cm is not referenced directly here; presumably the OP macro uses it to scale and clip the sum - confirm. */\
1311 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313     int i;\
1314     for(i=0; i<h; i++)\
1315     {\
1316         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324         dst+=dstStride;\
1325         src+=srcStride;\
1326     }\
1327 }\
1328 \
     /* Vertical counterpart of the 8-tap lowpass: for each of 8 columns, reads 9 vertically adjacent samples and writes 8 outputs. */\
     /* Taps and edge mirroring are the same as in the horizontal version; the block is always 8 wide (w is fixed). */\
1329 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330     const int w=8;\
1331     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332     int i;\
1333     for(i=0; i<w; i++)\
1334     {\
         /* Load the whole column first, so the OP stores below cannot disturb samples still needed. */\
1335         const int src0= src[0*srcStride];\
1336         const int src1= src[1*srcStride];\
1337         const int src2= src[2*srcStride];\
1338         const int src3= src[3*srcStride];\
1339         const int src4= src[4*srcStride];\
1340         const int src5= src[5*srcStride];\
1341         const int src6= src[6*srcStride];\
1342         const int src7= src[7*srcStride];\
1343         const int src8= src[8*srcStride];\
1344         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352         dst++;\
1353         src++;\
1354     }\
1355 }\
1356 \
     /* 16-wide horizontal 8-tap lowpass (taps -1, 3, -6, 20, 20, -6, 3, -1) for h rows. */\
     /* Only src[0..16] is read; taps outside that range are mirrored back inside (first three and last three outputs). */\
1357 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359     int i;\
1360     \
1361     for(i=0; i<h; i++)\
1362     {\
1363         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379         dst+=dstStride;\
1380         src+=srcStride;\
1381     }\
1382 }\
1383 \
     /* 16x16 vertical 8-tap lowpass: for each of 16 columns, reads 17 vertically adjacent samples and writes 16 outputs. */\
     /* Taps outside rows 0..16 are mirrored back inside, as in the horizontal version. */\
1384 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386     int i;\
1387     const int w=16;\
1388     for(i=0; i<w; i++)\
1389     {\
         /* Load the whole column before any store. */\
1390         const int src0= src[0*srcStride];\
1391         const int src1= src[1*srcStride];\
1392         const int src2= src[2*srcStride];\
1393         const int src3= src[3*srcStride];\
1394         const int src4= src[4*srcStride];\
1395         const int src5= src[5*srcStride];\
1396         const int src6= src[6*srcStride];\
1397         const int src7= src[7*srcStride];\
1398         const int src8= src[8*srcStride];\
1399         const int src9= src[9*srcStride];\
1400         const int src10= src[10*srcStride];\
1401         const int src11= src[11*srcStride];\
1402         const int src12= src[12*srcStride];\
1403         const int src13= src[13*srcStride];\
1404         const int src14= src[14*srcStride];\
1405         const int src15= src[15*srcStride];\
1406         const int src16= src[16*srcStride];\
1407         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423         dst++;\
1424         src++;\
1425     }\
1426 }\
1427 \
     /* Position (0,0): no filtering, forwards to the plain 8x8 pixel operation. */\
1428 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429     OPNAME ## pixels8_c(dst, src, stride, 8);\
1430 }\
1431 \
     /* Position (1,0): horizontally filter into an 8x8 scratch block, then combine it with the unshifted source via pixels8_l2 */\
     /* (defined outside this view; presumably a per-pixel average of its two inputs - confirm). */\
1432 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433     uint8_t half[64];\
1434     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436 }\
1437 \
     /* Position (2,0): the horizontal lowpass output is written straight to dst. */\
1438 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440 }\
1441 \
     /* Position (3,0): like mc10, but the filtered block is combined with the source shifted one pixel right (src+1). */\
1442 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443     uint8_t half[64];\
1444     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446 }\
1447 \
     /* Position (0,1): copy a 9x9 window into `full` (row pitch 16), vertically filter it into `half`, */\
     /* then combine `half` with the unshifted rows of `full` via pixels8_l2. */\
1448 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449     uint8_t full[16*9];\
1450     uint8_t half[64];\
1451     copy_block9(full, src, 16, stride, 9);\
1452     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454 }\
1455 \
     /* Position (0,2): copy a 9x9 window into `full` (row pitch 16) and write its vertical lowpass straight to dst. */\
1456 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457     uint8_t full[16*9];\
1458     copy_block9(full, src, 16, stride, 9);\
1459     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460 }\
1461 \
     /* Position (0,3): like mc01, but the filtered block is combined with the source shifted one row down (full+16). */\
1462 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463     uint8_t full[16*9];\
1464     uint8_t half[64];\
1465     copy_block9(full, src, 16, stride, 9);\
1466     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468 }\
     /* Older (1,1) variant, exported (non-static): builds the horizontal (halfH, 9 rows), vertical (halfV) and */\
     /* horizontal-then-vertical (halfHV) filtered blocks, and combines them with the source in one pixels8_l4 call */\
     /* (defined outside this view; presumably a four-way average - confirm). */\
1469 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470     uint8_t full[16*9];\
1471     uint8_t halfH[72];\
1472     uint8_t halfV[64];\
1473     uint8_t halfHV[64];\
1474     copy_block9(full, src, 16, stride, 9);\
1475     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479 }\
     /* Position (1,1): filter 9 rows horizontally into halfH, combine halfH in place with the unshifted source, */\
     /* filter that result vertically into halfHV, and finally combine halfH with halfHV into dst. */\
1480 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481     uint8_t full[16*9];\
1482     uint8_t halfH[72];\
1483     uint8_t halfHV[64];\
1484     copy_block9(full, src, 16, stride, 9);\
1485     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489 }\
1490 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491     uint8_t full[16*9];\
1492     uint8_t halfH[72];\
1493     uint8_t halfV[64];\
1494     uint8_t halfHV[64];\
1495     copy_block9(full, src, 16, stride, 9);\
1496     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500 }\
1501 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502     uint8_t full[16*9];\
1503     uint8_t halfH[72];\
1504     uint8_t halfHV[64];\
1505     copy_block9(full, src, 16, stride, 9);\
1506     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510 }\
1511 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512     uint8_t full[16*9];\
1513     uint8_t halfH[72];\
1514     uint8_t halfV[64];\
1515     uint8_t halfHV[64];\
1516     copy_block9(full, src, 16, stride, 9);\
1517     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521 }\
1522 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523     uint8_t full[16*9];\
1524     uint8_t halfH[72];\
1525     uint8_t halfHV[64];\
1526     copy_block9(full, src, 16, stride, 9);\
1527     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531 }\
1532 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533     uint8_t full[16*9];\
1534     uint8_t halfH[72];\
1535     uint8_t halfV[64];\
1536     uint8_t halfHV[64];\
1537     copy_block9(full, src, 16, stride, 9);\
1538     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1539     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542 }\
1543 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544     uint8_t full[16*9];\
1545     uint8_t halfH[72];\
1546     uint8_t halfHV[64];\
1547     copy_block9(full, src, 16, stride, 9);\
1548     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552 }\
1553 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554     uint8_t halfH[72];\
1555     uint8_t halfHV[64];\
1556     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559 }\
1560 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561     uint8_t halfH[72];\
1562     uint8_t halfHV[64];\
1563     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566 }\
1567 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568     uint8_t full[16*9];\
1569     uint8_t halfH[72];\
1570     uint8_t halfV[64];\
1571     uint8_t halfHV[64];\
1572     copy_block9(full, src, 16, stride, 9);\
1573     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577 }\
1578 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579     uint8_t full[16*9];\
1580     uint8_t halfH[72];\
1581     copy_block9(full, src, 16, stride, 9);\
1582     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585 }\
1586 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587     uint8_t full[16*9];\
1588     uint8_t halfH[72];\
1589     uint8_t halfV[64];\
1590     uint8_t halfHV[64];\
1591     copy_block9(full, src, 16, stride, 9);\
1592     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596 }\
1597 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598     uint8_t full[16*9];\
1599     uint8_t halfH[72];\
1600     copy_block9(full, src, 16, stride, 9);\
1601     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604 }\
1605 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606     uint8_t halfH[72];\
1607     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609 }\
1610 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611     OPNAME ## pixels16_c(dst, src, stride, 16);\
1612 }\
1613 \
1614 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615     uint8_t half[256];\
1616     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618 }\
1619 \
1620 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622 }\
1623 \
1624 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625     uint8_t half[256];\
1626     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628 }\
1629 \
1630 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631     uint8_t full[24*17];\
1632     uint8_t half[256];\
1633     copy_block17(full, src, 24, stride, 17);\
1634     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636 }\
1637 \
1638 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639     uint8_t full[24*17];\
1640     copy_block17(full, src, 24, stride, 17);\
1641     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642 }\
1643 \
1644 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645     uint8_t full[24*17];\
1646     uint8_t half[256];\
1647     copy_block17(full, src, 24, stride, 17);\
1648     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650 }\
1651 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652     uint8_t full[24*17];\
1653     uint8_t halfH[272];\
1654     uint8_t halfV[256];\
1655     uint8_t halfHV[256];\
1656     copy_block17(full, src, 24, stride, 17);\
1657     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661 }\
1662 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663     uint8_t full[24*17];\
1664     uint8_t halfH[272];\
1665     uint8_t halfHV[256];\
1666     copy_block17(full, src, 24, stride, 17);\
1667     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671 }\
1672 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673     uint8_t full[24*17];\
1674     uint8_t halfH[272];\
1675     uint8_t halfV[256];\
1676     uint8_t halfHV[256];\
1677     copy_block17(full, src, 24, stride, 17);\
1678     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682 }\
1683 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684     uint8_t full[24*17];\
1685     uint8_t halfH[272];\
1686     uint8_t halfHV[256];\
1687     copy_block17(full, src, 24, stride, 17);\
1688     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692 }\
1693 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694     uint8_t full[24*17];\
1695     uint8_t halfH[272];\
1696     uint8_t halfV[256];\
1697     uint8_t halfHV[256];\
1698     copy_block17(full, src, 24, stride, 17);\
1699     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703 }\
1704 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705     uint8_t full[24*17];\
1706     uint8_t halfH[272];\
1707     uint8_t halfHV[256];\
1708     copy_block17(full, src, 24, stride, 17);\
1709     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713 }\
1714 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715     uint8_t full[24*17];\
1716     uint8_t halfH[272];\
1717     uint8_t halfV[256];\
1718     uint8_t halfHV[256];\
1719     copy_block17(full, src, 24, stride, 17);\
1720     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1721     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724 }\
1725 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726     uint8_t full[24*17];\
1727     uint8_t halfH[272];\
1728     uint8_t halfHV[256];\
1729     copy_block17(full, src, 24, stride, 17);\
1730     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734 }\
1735 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736     uint8_t halfH[272];\
1737     uint8_t halfHV[256];\
1738     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741 }\
1742 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743     uint8_t halfH[272];\
1744     uint8_t halfHV[256];\
1745     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748 }\
1749 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750     uint8_t full[24*17];\
1751     uint8_t halfH[272];\
1752     uint8_t halfV[256];\
1753     uint8_t halfHV[256];\
1754     copy_block17(full, src, 24, stride, 17);\
1755     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759 }\
1760 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761     uint8_t full[24*17];\
1762     uint8_t halfH[272];\
1763     copy_block17(full, src, 24, stride, 17);\
1764     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767 }\
1768 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769     uint8_t full[24*17];\
1770     uint8_t halfH[272];\
1771     uint8_t halfV[256];\
1772     uint8_t halfHV[256];\
1773     copy_block17(full, src, 24, stride, 17);\
1774     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778 }\
1779 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[24*17];\
1781     uint8_t halfH[272];\
1782     copy_block17(full, src, 24, stride, 17);\
1783     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786 }\
1787 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788     uint8_t halfH[272];\
1789     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791 }
1792
/*
 * Store operators handed to QPEL_MC as its last argument.
 * In each of them `a` is the destination lvalue and `b` the raw filter sum.
 * `cm` is not a macro parameter, so a variable of that name has to be in
 * scope wherever the operator is expanded.
 * The sum gets a bias of 16 (plain forms) or 15 (_no_rnd forms), is shifted
 * right by 5 and is then looked up through cm[].
 */
/* average the looked-up value with what `a` already holds; the +1 rounds up */
1793 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* same average without the +1, and with the smaller bias of 15 */
1794 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
/* plain store, bias 16 */
1795 #define op_put(a, b) a = cm[((b) + 16)>>5]
/* plain store, bias 15 */
1796 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1797
/*
 * Expand QPEL_MC (defined earlier in this file) once per output flavour.
 * The second argument is pasted in front of the generated function names,
 * the third is pasted between "put" and "mpeg4_qpel..." to select the helper
 * kernels, and the fourth is the store operator.
 * NOTE(review): the first argument is 1 only for the no_rnd flavour; its
 * exact meaning is fixed by the macro header -- confirm there.
 */
1798 QPEL_MC(0, put_       , _       , op_put)
1799 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1800 QPEL_MC(0, avg_       , _       , op_avg)
/*
 * The avg_no_rnd flavour is disabled.  Note that the disabled line passes
 * op_avg, so none of the expansions above uses op_avg_no_rnd.
 */
1801 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* the operators are local to the expansions above; drop them again */
1802 #undef op_avg
1803 #undef op_avg_no_rnd
1804 #undef op_put
1805 #undef op_put_no_rnd
1806
1807 #if 1
/**
 * Generates the h264_qpel lowpass kernels for 4x4, 8x8 and 16x16 blocks:
 * OPNAME##h264_qpel{4,8,16}_{h,v,hv}_lowpass.
 *
 * Every kernel evaluates the 6-tap filter (1, -5, 20, 20, -5, 1) that is
 * written out in the expressions below; nothing is scaled or clipped inside
 * the expressions themselves, that is left to the store operators.
 *
 * @param OPNAME prefix pasted onto every generated function name
 * @param OP     store operator applied after one filter pass, used as
 *               OP(destination lvalue, filter sum)
 * @param OP2    store operator applied after two cascaded passes (hv kernels)
 *
 * Each 4x4 and 8x8 kernel defines a local `cm` = cropTbl + MAX_NEG_CROP for
 * the store operators to index.
 * The h kernels read two samples to the left and three to the right of the
 * block; the v and hv kernels read two rows above and three rows below it.
 */
1808 #define H264_LOWPASS(OPNAME, OP, OP2) \
     /* Horizontal pass, 4x4 block: every row reads src[-2] .. src[6]. */\
1809 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1810     const int h=4;\
1811     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1812     int i;\
1813     for(i=0; i<h; i++)\
1814     {\
1815         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1816         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1817         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1818         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1819         dst+=dstStride;\
1820         src+=srcStride;\
1821     }\
1822 }\
1823 \
     /* Vertical pass, 4x4 block: one column per iteration, rows -2 .. 6. */\
     /* All nine samples are loaded before the first OP store. */\
1824 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1825     const int w=4;\
1826     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1827     int i;\
1828     for(i=0; i<w; i++)\
1829     {\
1830         const int srcB= src[-2*srcStride];\
1831         const int srcA= src[-1*srcStride];\
1832         const int src0= src[0 *srcStride];\
1833         const int src1= src[1 *srcStride];\
1834         const int src2= src[2 *srcStride];\
1835         const int src3= src[3 *srcStride];\
1836         const int src4= src[4 *srcStride];\
1837         const int src5= src[5 *srcStride];\
1838         const int src6= src[6 *srcStride];\
1839         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1840         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1841         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1842         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1843         dst++;\
1844         src++;\
1845     }\
1846 }\
1847 \
     /* Two-pass 4x4 kernel.  Pass 1 filters h+5 rows horizontally, starting */\
     /* two rows above src, and keeps the raw sums in tmp (tmpStride apart). */\
     /* Pass 2 filters tmp vertically and stores through OP2.               */\
1848 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1849     const int h=4;\
1850     const int w=4;\
1851     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1852     int i;\
1853     src -= 2*srcStride;\
1854     for(i=0; i<h+5; i++)\
1855     {\
1856         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1857         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1858         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1859         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1860         tmp+=tmpStride;\
1861         src+=srcStride;\
1862     }\
         /* tmp moved h+5 rows; go back to intermediate row 2, which lines */\
         /* up with output row 0 because pass 1 began two rows above src.  */\
1863     tmp -= tmpStride*(h+5-2);\
1864     for(i=0; i<w; i++)\
1865     {\
1866         const int tmpB= tmp[-2*tmpStride];\
1867         const int tmpA= tmp[-1*tmpStride];\
1868         const int tmp0= tmp[0 *tmpStride];\
1869         const int tmp1= tmp[1 *tmpStride];\
1870         const int tmp2= tmp[2 *tmpStride];\
1871         const int tmp3= tmp[3 *tmpStride];\
1872         const int tmp4= tmp[4 *tmpStride];\
1873         const int tmp5= tmp[5 *tmpStride];\
1874         const int tmp6= tmp[6 *tmpStride];\
1875         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1876         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1877         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1878         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1879         dst++;\
1880         tmp++;\
1881     }\
1882 }\
1883 \
     /* Horizontal pass, 8x8 block: every row reads src[-2] .. src[10]. */\
1884 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1885     const int h=8;\
1886     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1887     int i;\
1888     for(i=0; i<h; i++)\
1889     {\
1890         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1891         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1892         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1893         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1894         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1895         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1896         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1897         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1898         dst+=dstStride;\
1899         src+=srcStride;\
1900     }\
1901 }\
1902 \
     /* Vertical pass, 8x8 block: one column per iteration, rows -2 .. 10. */\
1903 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1904     const int w=8;\
1905     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1906     int i;\
1907     for(i=0; i<w; i++)\
1908     {\
1909         const int srcB= src[-2*srcStride];\
1910         const int srcA= src[-1*srcStride];\
1911         const int src0= src[0 *srcStride];\
1912         const int src1= src[1 *srcStride];\
1913         const int src2= src[2 *srcStride];\
1914         const int src3= src[3 *srcStride];\
1915         const int src4= src[4 *srcStride];\
1916         const int src5= src[5 *srcStride];\
1917         const int src6= src[6 *srcStride];\
1918         const int src7= src[7 *srcStride];\
1919         const int src8= src[8 *srcStride];\
1920         const int src9= src[9 *srcStride];\
1921         const int src10=src[10*srcStride];\
1922         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1923         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1924         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1925         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1926         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1927         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1928         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1929         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1930         dst++;\
1931         src++;\
1932     }\
1933 }\
1934 \
     /* Two-pass 8x8 kernel; same scheme as the 4x4 hv kernel: h+5 rows of */\
     /* raw horizontal sums go to tmp, then a vertical pass stores via OP2. */\
1935 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1936     const int h=8;\
1937     const int w=8;\
1938     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1939     int i;\
1940     src -= 2*srcStride;\
1941     for(i=0; i<h+5; i++)\
1942     {\
1943         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1944         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1945         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1946         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1947         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1948         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1949         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1950         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1951         tmp+=tmpStride;\
1952         src+=srcStride;\
1953     }\
         /* rewind to intermediate row 2, the row matching output row 0 */\
1954     tmp -= tmpStride*(h+5-2);\
1955     for(i=0; i<w; i++)\
1956     {\
1957         const int tmpB= tmp[-2*tmpStride];\
1958         const int tmpA= tmp[-1*tmpStride];\
1959         const int tmp0= tmp[0 *tmpStride];\
1960         const int tmp1= tmp[1 *tmpStride];\
1961         const int tmp2= tmp[2 *tmpStride];\
1962         const int tmp3= tmp[3 *tmpStride];\
1963         const int tmp4= tmp[4 *tmpStride];\
1964         const int tmp5= tmp[5 *tmpStride];\
1965         const int tmp6= tmp[6 *tmpStride];\
1966         const int tmp7= tmp[7 *tmpStride];\
1967         const int tmp8= tmp[8 *tmpStride];\
1968         const int tmp9= tmp[9 *tmpStride];\
1969         const int tmp10=tmp[10*tmpStride];\
1970         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1971         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1972         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1973         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1974         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1975         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1976         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1977         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1978         dst++;\
1979         tmp++;\
1980     }\
1981 }\
1982 \
     /* 16x16 vertical pass built from four 8x8 calls: top-left, top-right, */\
     /* then the two lower quadrants after stepping eight rows down.        */\
1983 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1984     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1985     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1986     src += 8*srcStride;\
1987     dst += 8*dstStride;\
1988     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
1989     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1990 }\
1991 \
     /* 16x16 horizontal pass, tiled the same way from four 8x8 calls. */\
1992 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1993     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1994     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1995     src += 8*srcStride;\
1996     dst += 8*dstStride;\
1997     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
1998     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1999 }\
2000 \
     /* 16x16 two-pass kernel from four 8x8 calls.  tmp is offset along with */\
     /* dst and src (8 columns right, 8 rows down) for each quadrant.        */\
2001 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2002     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2003     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2004     src += 8*srcStride;\
2005     tmp += 8*tmpStride;\
2006     dst += 8*dstStride;\
2007     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2008     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2009 }\
2010
/**
 * Generates the sixteen entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c(dst, src, stride) for one block size.
 *
 * As the bodies show, X selects the horizontal and Y the vertical position:
 * 0 uses the source samples, 2 uses the lowpass-filtered plane, and 1 / 3
 * combine two planes through the _l2 helper.
 *
 * @param OPNAME prefix of the generated names and of the helper that performs
 *               the final write to dst
 * @param SIZE   block width and height; pasted into helper names and used to
 *               size the scratch buffers
 *
 * Scratch planes are always produced with the put_ lowpass kernels, so the
 * put_ expansion of H264_LOWPASS has to exist; OPNAME is applied only on the
 * final write to dst.
 * `full` holds SIZE+5 rows of SIZE samples copied from two rows above src,
 * and `full_mid` (= full + 2*SIZE) is the row that corresponds to src.
 * copy_block##SIZE, OPNAME##pixels##SIZE##_c and OPNAME##pixels##SIZE##_l2
 * are defined elsewhere.
 * NOTE(review): _l2 presumably averages its two source planes into dst --
 * confirm against its definition.
 */
2011 #define H264_MC(OPNAME, SIZE) \
     /* mc00: no filtering, hand the source block to the pixels helper */\
2012 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2013     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2014 }\
2015 \
     /* mc10: source block combined with the horizontal lowpass plane */\
2016 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2017     uint8_t half[SIZE*SIZE];\
2018     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2019     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2020 }\
2021 \
     /* mc20: horizontal lowpass written straight to dst */\
2022 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2023     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2024 }\
2025 \
     /* mc30: like mc10, but paired with the block one sample to the right */\
2026 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2027     uint8_t half[SIZE*SIZE];\
2028     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2029     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2030 }\
2031 \
     /* mc01: source block (via full_mid) combined with the vertical plane */\
2032 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2033     uint8_t full[SIZE*(SIZE+5)];\
2034     uint8_t * const full_mid= full + SIZE*2;\
2035     uint8_t half[SIZE*SIZE];\
2036     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2037     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2038     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2039 }\
2040 \
     /* mc02: vertical lowpass of the copied block written straight to dst */\
2041 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t full[SIZE*(SIZE+5)];\
2043     uint8_t * const full_mid= full + SIZE*2;\
2044     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2045     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2046 }\
2047 \
     /* mc03: like mc01, but paired with the block one row further down */\
2048 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2049     uint8_t full[SIZE*(SIZE+5)];\
2050     uint8_t * const full_mid= full + SIZE*2;\
2051     uint8_t half[SIZE*SIZE];\
2052     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2053     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2054     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2055 }\
2056 \
     /* mc11: horizontal plane at src combined with vertical plane at src */\
2057 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058     uint8_t full[SIZE*(SIZE+5)];\
2059     uint8_t * const full_mid= full + SIZE*2;\
2060     uint8_t halfH[SIZE*SIZE];\
2061     uint8_t halfV[SIZE*SIZE];\
2062     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2063     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2064     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2065     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2066 }\
2067 \
     /* mc31: as mc11, with the vertical plane taken one column to the right */\
2068 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2069     uint8_t full[SIZE*(SIZE+5)];\
2070     uint8_t * const full_mid= full + SIZE*2;\
2071     uint8_t halfH[SIZE*SIZE];\
2072     uint8_t halfV[SIZE*SIZE];\
2073     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2074     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2075     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2076     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2077 }\
2078 \
     /* mc13: as mc11, with the horizontal plane taken one row further down */\
2079 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2080     uint8_t full[SIZE*(SIZE+5)];\
2081     uint8_t * const full_mid= full + SIZE*2;\
2082     uint8_t halfH[SIZE*SIZE];\
2083     uint8_t halfV[SIZE*SIZE];\
2084     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2085     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2086     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2087     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2088 }\
2089 \
     /* mc33: horizontal plane one row down, vertical plane one column right */\
2090 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2091     uint8_t full[SIZE*(SIZE+5)];\
2092     uint8_t * const full_mid= full + SIZE*2;\
2093     uint8_t halfH[SIZE*SIZE];\
2094     uint8_t halfV[SIZE*SIZE];\
2095     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2096     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2097     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2098     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2099 }\
2100 \
     /* mc22: two-pass (hv) lowpass written straight to dst; tmp is scratch */\
2101 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2102     int16_t tmp[SIZE*(SIZE+5)];\
2103     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2104 }\
2105 \
     /* mc21: horizontal plane at src combined with the hv plane */\
2106 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2107     int16_t tmp[SIZE*(SIZE+5)];\
2108     uint8_t halfH[SIZE*SIZE];\
2109     uint8_t halfHV[SIZE*SIZE];\
2110     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2111     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2112     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2113 }\
2114 \
     /* mc23: as mc21, with the horizontal plane taken one row further down */\
2115 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2116     int16_t tmp[SIZE*(SIZE+5)];\
2117     uint8_t halfH[SIZE*SIZE];\
2118     uint8_t halfHV[SIZE*SIZE];\
2119     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2120     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2121     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2122 }\
2123 \
     /* mc12: vertical plane at src combined with the hv plane */\
2124 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2125     uint8_t full[SIZE*(SIZE+5)];\
2126     uint8_t * const full_mid= full + SIZE*2;\
2127     int16_t tmp[SIZE*(SIZE+5)];\
2128     uint8_t halfV[SIZE*SIZE];\
2129     uint8_t halfHV[SIZE*SIZE];\
2130     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2131     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2132     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2133     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2134 }\
2135 \
     /* mc32: as mc12, with the vertical plane taken one column to the right */\
2136 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2137     uint8_t full[SIZE*(SIZE+5)];\
2138     uint8_t * const full_mid= full + SIZE*2;\
2139     int16_t tmp[SIZE*(SIZE+5)];\
2140     uint8_t halfV[SIZE*SIZE];\
2141     uint8_t halfHV[SIZE*SIZE];\
2142     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2143     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2144     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2145     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2146 }\
2147
/*
 * Store operators handed to the H.264 low-pass macros.
 * Each one rounds the filter sum b, shifts it down, looks the result up
 * through cm (which must be in scope where the macro is expanded) and stores
 * it in a.
 *   op_put / op_avg   : round with +16,  shift right by 5
 *   op2_put / op2_avg : round with +512, shift right by 10
 * The avg variants average the looked-up value with the current content of a,
 * rounding up, so a is evaluated twice there.
 * The target and the whole expansion are parenthesized so the macros stay
 * well-formed for any lvalue argument and inside larger expressions.
 */
#define op_avg(a, b)  ((a) = (((a)+cm[((b) + 16)>>5]+1)>>1))
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  ((a) = cm[((b) + 16)>>5])
#define op2_avg(a, b)  ((a) = (((a)+cm[((b) + 512)>>10]+1)>>1))
#define op2_put(a, b)  ((a) = cm[((b) + 512)>>10])
2153
/* Expand H264_LOWPASS once per store mode (put_ / avg_), passing the matching
   store operators; the macro itself is defined earlier in the file. */
2154 H264_LOWPASS(put_       , op_put, op2_put)
2155 H264_LOWPASS(avg_       , op_avg, op2_avg)
/* Expand the h264_qpel mcXY function set for both store modes and for
   SIZE 4, 8 and 16. */
2156 H264_MC(put_, 4)
2157 H264_MC(put_, 8)
2158 H264_MC(put_, 16)
2159 H264_MC(avg_, 4)
2160 H264_MC(avg_, 8)
2161 H264_MC(avg_, 16)
2162
/* The store operators are dropped here, after the expansions that use them. */
2163 #undef op_avg
2164 #undef op_put
2165 #undef op2_avg
2166 #undef op2_put
2167 #endif
2168
2169 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2170     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2171     int i;
2172
2173     for(i=0; i<h; i++){
2174         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2175         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2176         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2177         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2178         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2179         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2180         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2181         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2182         dst+=dstStride;
2183         src+=srcStride;
2184     }
2185 }
2186
2187 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2188     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2189     int i;
2190
2191     for(i=0; i<w; i++){
2192         const int src_1= src[ -srcStride];
2193         const int src0 = src[0          ];
2194         const int src1 = src[  srcStride];
2195         const int src2 = src[2*srcStride];
2196         const int src3 = src[3*srcStride];
2197         const int src4 = src[4*srcStride];
2198         const int src5 = src[5*srcStride];
2199         const int src6 = src[6*srcStride];
2200         const int src7 = src[7*srcStride];
2201         const int src8 = src[8*srcStride];
2202         const int src9 = src[9*srcStride];
2203         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2204         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2205         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2206         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2207         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2208         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2209         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2210         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2211         src++;
2212         dst++;
2213     }
2214 }
2215
/** mspel position (0,0): hands src to put_pixels8_c unfiltered, for 8 rows. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2219
/**
 * mspel position (1,0): combines src with its horizontally filtered version
 * through put_pixels8_l2.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2225
/** mspel position (2,0): horizontal filter written straight into dst, 8 rows. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2229
/**
 * mspel position (3,0): combines the pixels one column to the right of src
 * with the horizontally filtered block through put_pixels8_l2.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];
    uint8_t *const right = src + 1;

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, right, filtered, stride, stride, 8, 8);
}
2235
/** mspel position (0,2): vertical filter written straight into dst, 8 columns. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2239
/**
 * mspel position (1,2): combines the vertically filtered block with the
 * horizontally-then-vertically filtered block through put_pixels8_l2.
 * The horizontal pass covers 11 rows starting one row above src, so the
 * vertical pass over it has the extra rows it reads.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11 * 8]; /* 11 horizontally filtered rows, 8 bytes each */
    uint8_t vert[8 * 8];
    uint8_t both[8 * 8];
    uint8_t *const rows_h_mid = rows_h + 8; /* row that lines up with src */

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h_mid, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): like position (1,2), except that the vertically
 * filtered block is taken one column to the right of src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11 * 8]; /* 11 horizontally filtered rows, 8 bytes each */
    uint8_t vert[8 * 8];
    uint8_t both[8 * 8];
    uint8_t *const rows_h_mid = rows_h + 8; /* row that lines up with src */

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h_mid, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal filter over 11 rows (starting one row
 * above src) into a scratch buffer, then vertical filter from that buffer
 * into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11 * 8]; /* 11 horizontally filtered rows, 8 bytes each */
    uint8_t *const rows_h_mid = rows_h + 8; /* row that lines up with src */

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h_mid, stride, 8, 8);
}
2263
2264 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2265     int x;
2266     const int strength= ff_h263_loop_filter_strength[qscale];
2267
2268     for(x=0; x<8; x++){
2269         int d1, d2, ad1;
2270         int p0= src[x-2*stride];
2271         int p1= src[x-1*stride];
2272         int p2= src[x+0*stride];
2273         int p3= src[x+1*stride];
2274         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2275
2276         if     (d<-2*strength) d1= 0;
2277         else if(d<-  strength) d1=-2*strength - d;
2278         else if(d<   strength) d1= d;
2279         else if(d< 2*strength) d1= 2*strength - d;
2280         else                   d1= 0;
2281
2282         p1 += d1;
2283         p2 -= d1;
2284         if(p1&256) p1= ~(p1>>31);
2285         if(p2&256) p2= ~(p2>>31);
2286
2287         src[x-1*stride] = p1;
2288         src[x+0*stride] = p2;
2289
2290         ad1= ABS(d1)>>1;
2291
2292         d2= clip((p0-p3)/4, -ad1, ad1);
2293
2294         src[x-2*stride] = p0 - d2;
2295         src[x+  stride] = p3 + d2;
2296     }
2297 }
2298
2299 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2300     int y;
2301     const int strength= ff_h263_loop_filter_strength[qscale];
2302
2303     for(y=0; y<8; y++){
2304         int d1, d2, ad1;
2305         int p0= src[y*stride-2];
2306         int p1= src[y*stride-1];
2307         int p2= src[y*stride+0];
2308         int p3= src[y*stride+1];
2309         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2310
2311         if     (d<-2*strength) d1= 0;
2312         else if(d<-  strength) d1=-2*strength - d;
2313         else if(d<   strength) d1= d;
2314         else if(d< 2*strength) d1= 2*strength - d;
2315         else                   d1= 0;
2316
2317         p1 += d1;
2318         p2 -= d1;
2319         if(p1&256) p1= ~(p1>>31);
2320         if(p2&256) p2= ~(p2>>31);
2321
2322         src[y*stride-1] = p1;
2323         src[y*stride+0] = p2;
2324
2325         ad1= ABS(d1)>>1;
2326
2327         d2= clip((p0-p3)/4, -ad1, ad1);
2328
2329         src[y*stride-2] = p0 - d2;
2330         src[y*stride+1] = p3 + d2;
2331     }
2332 }
2333
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2361
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2 pixel
 * with its right neighbour, for blocks 16 pixels wide.
 * Reads pix2[0] .. pix2[16] in every row.
 * @param v         unused
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2389
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2 pixel
 * with the pixel one row below it, for blocks 16 pixels wide.
 * Reads h + 1 rows of pix2.
 * @param v         unused
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *const below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2419
/**
 * Sum of absolute differences between pix1 and the avg4() of each 2x2
 * neighbourhood of pix2 (pixel, right, below, below-right), for blocks
 * 16 pixels wide. Reads 17 pixels in each of h + 1 rows of pix2.
 * @param v         unused
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *const below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2449
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2469
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2 pixel
 * with its right neighbour, for blocks 8 pixels wide.
 * Reads pix2[0] .. pix2[8] in every row.
 * @param v         unused
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2489
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2 pixel
 * with the pixel one row below it, for blocks 8 pixels wide.
 * Reads h + 1 rows of pix2.
 * @param v         unused
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *const below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2511
/**
 * Sum of absolute differences between pix1 and the avg4() of each 2x2
 * neighbourhood of pix2 (pixel, right, below, below-right), for blocks
 * 8 pixels wide. Reads 9 pixels in each of h + 1 rows of pix2.
 * @param v         unused
 * @param line_size distance between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *const below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2  = below;
    }
    return sad;
}
2533
2534 /**
2535  * permutes an 8x8 block.
2536  * @param block the block which will be permuted according to the given permutation vector
2537  * @param permutation the permutation vector
2538  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2539  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2540  *                  (inverse) permutated to scantable order!
2541  */
2542 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2543 {
2544     int i;
2545     DCTELEM temp[64];
2546
2547     if(last<=0) return;
2548     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2549
2550     for(i=0; i<=last; i++){
2551         const int j= scantable[i];
2552         temp[j]= block[j];
2553         block[j]=0;
2554     }
2555
2556     for(i=0; i<=last; i++){
2557         const int j= scantable[i];
2558         const int perm_j= permutation[j];
2559         block[perm_j]= temp[j];
2560     }
2561 }
2562
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
2566
2567 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2568     int i;
2569
2570     memset(cmp, 0, sizeof(void*)*5);
2571
2572     for(i=0; i<5; i++){
2573         switch(type&0xFF){
2574         case FF_CMP_SAD:
2575             cmp[i]= c->sad[i];
2576             break;
2577         case FF_CMP_SATD:
2578             cmp[i]= c->hadamard8_diff[i];
2579             break;
2580         case FF_CMP_SSE:
2581             cmp[i]= c->sse[i];
2582             break;
2583         case FF_CMP_DCT:
2584             cmp[i]= c->dct_sad[i];
2585             break;
2586         case FF_CMP_PSNR:
2587             cmp[i]= c->quant_psnr[i];
2588             break;
2589         case FF_CMP_BIT:
2590             cmp[i]= c->bit[i];
2591             break;
2592         case FF_CMP_RD:
2593             cmp[i]= c->rd[i];
2594             break;
2595         case FF_CMP_VSAD:
2596             cmp[i]= c->vsad[i];
2597             break;
2598         case FF_CMP_VSSE:
2599             cmp[i]= c->vsse[i];
2600             break;
2601         case FF_CMP_ZERO:
2602             cmp[i]= zero_cmp;
2603             break;
2604         default:
2605             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2606         }
2607     }
2608 }
2609
2610 /**
2611  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2612  */
2613 static void clear_blocks_c(DCTELEM *blocks)
2614 {
2615     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2616 }
2617
/**
 * Adds src to dst byte by byte, wrapping modulo 256.
 * @param dst bytes that are updated in place
 * @param src bytes that are added
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    int pos;

    for (pos = 0; pos < w; pos++)
        dst[pos] += src[pos];
}
2633
/**
 * Stores the byte-wise difference src1 - src2 in dst, wrapping modulo 256.
 * @param dst  destination bytes
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int pos;

    for (pos = 0; pos < w; pos++)
        dst[pos] = src1[pos] - src2[pos];
}
2649
/**
 * Writes to dst the difference between src2 and a median prediction.
 * For each position the prediction is mid_pred() of the left pixel, the
 * src1 pixel at the same position, and (left + that src1 pixel - top-left)
 * masked to 8 bits.
 * @param dst      residual output, w bytes
 * @param src1     row supplying the second predictor and the next top-left value
 * @param src2     row being predicted; each pixel becomes the next left value
 * @param w        number of pixels
 * @param left     in: left neighbour of the first pixel, out: last src2 pixel processed
 * @param left_top in: top-left neighbour of the first pixel, out: last src1 pixel processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top      = src1[x];
        const int gradient = (cur_left + top - cur_top_left) & 0xFF;
        const int pred     = mid_pred(cur_left, top, gradient);

        cur_top_left = top;
        cur_left     = src2[x];
        dst[x]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
2667
/*
 * Butterfly helpers for the Hadamard transforms below.
 * BUTTERFLY2 and BUTTERFLY1 are statement macros wrapped in do { } while(0),
 * so they behave as a single statement (e.g. under an unbraced if) and must
 * be followed by a semicolon.
 */

/* o1 = i1 + i2, o2 = i1 - i2; i1 and i2 are each evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/* in place: x becomes x + y and y becomes x - y (both from the old values);
   the temporaries carry a suffix so they cannot capture arguments named a or b */
#define BUTTERFLY1(x,y) \
do {\
    const int bfly_a_= (x);\
    const int bfly_b_= (y);\
    (x)= bfly_a_+bfly_b_;\
    (y)= bfly_a_-bfly_b_;\
} while(0)

/* |x + y| + |x - y| as an expression; x and y are each evaluated more than once */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2682
/**
 * Applies an 8x8 butterfly (Hadamard-style) transform to the difference
 * src - dst and returns the sum of the absolute transformed values.
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance between rows, same for both blocks
 * @param h      must be 8 (asserted)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* row pass: three butterfly stages over the 8 differences of each row */
    for (n = 0; n < 8; n++) {
        int *const t = temp + 8 * n;
        const uint8_t *const sp = src + stride * n;
        const uint8_t *const dp = dst + stride * n;

        BUTTERFLY2(t[0], t[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(t[2], t[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(t[4], t[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(t[6], t[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* column pass: two in-place stages, the third is folded into the sum */
    for (n = 0; n < 8; n++) {
        int *const t = temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
             + BUTTERFLYA(t[8*1], t[8*5])
             + BUTTERFLYA(t[8*2], t[8*6])
             + BUTTERFLYA(t[8*3], t[8*7]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2734
/**
 * Applies an 8x8 butterfly (Hadamard-style) transform to the pixels of src
 * and returns the sum of the absolute transformed values, minus the term
 * the original marks as "-mean".
 * @param s      unused
 * @param src    8x8 block to transform
 * @param dummy  unused
 * @param stride distance between rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* row pass: three butterfly stages over the 8 pixels of each row */
    for (n = 0; n < 8; n++) {
        int *const t = temp + 8 * n;
        const uint8_t *const sp = src + stride * n;

        BUTTERFLY2(t[0], t[1], sp[0], sp[1]);
        BUTTERFLY2(t[2], t[3], sp[2], sp[3]);
        BUTTERFLY2(t[4], t[5], sp[4], sp[5]);
        BUTTERFLY2(t[6], t[7], sp[6], sp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* column pass: two in-place stages, the third is folded into the sum */
    for (n = 0; n < 8; n++) {
        int *const t = temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
             + BUTTERFLYA(t[8*1], t[8*5])
             + BUTTERFLYA(t[8*2], t[8*6])
             + BUTTERFLYA(t[8*3], t[8*7]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2782
2783 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2784     MpegEncContext * const s= (MpegEncContext *)c;
2785     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2786     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2787     int sum=0, i;
2788
2789     assert(h==8);
2790
2791     s->dsp.diff_pixels(temp, src1, src2, stride);
2792     s->dsp.fdct(temp);
2793
2794     for(i=0; i<64; i++)
2795         sum+= ABS(temp[i]);
2796
2797     return sum;
2798 }
2799
2800 void simple_idct(DCTELEM *block); //FIXME
2801
2802 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2803     MpegEncContext * const s= (MpegEncContext *)c;
2804     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2805     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2806     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2807     int sum=0, i;
2808
2809     assert(h==8);
2810     s->mb_intra=0;
2811
2812     s->dsp.diff_pixels(temp, src1, src2, stride);
2813
2814     memcpy(bak, temp, 64*sizeof(DCTELEM));
2815
2816     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2817     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2818     simple_idct(temp); //FIXME
2819
2820     for(i=0; i<64; i++)
2821         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2822
2823     return sum;
2824 }
2825
2826 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2827     MpegEncContext * const s= (MpegEncContext *)c;
2828     const uint8_t *scantable= s->intra_scantable.permutated;
2829     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2830     uint64_t __align8 aligned_bak[stride];
2831     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2832     uint8_t * const bak= (uint8_t*)aligned_bak;
2833     int i, last, run, bits, level, distoration, start_i;
2834     const int esc_length= s->ac_esc_length;
2835     uint8_t * length;
2836     uint8_t * last_length;
2837
2838     assert(h==8);
2839
2840     for(i=0; i<8; i++){
2841         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2842         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2843     }
2844
2845     s->dsp.diff_pixels(temp, src1, src2, stride);
2846
2847     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2848
2849     bits=0;
2850
2851     if (s->mb_intra) {
2852         start_i = 1;
2853         length     = s->intra_ac_vlc_length;
2854         last_length= s->intra_ac_vlc_last_length;
2855         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2856     } else {
2857         start_i = 0;
2858         length     = s->inter_ac_vlc_length;
2859         last_length= s->inter_ac_vlc_last_length;
2860     }
2861
2862     if(last>=start_i){
2863         run=0;
2864         for(i=start_i; i<last; i++){
2865             int j= scantable[i];
2866             level= temp[j];
2867
2868             if(level){
2869                 level+=64;
2870                 if((level&(~127)) == 0){
2871                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2872                 }else
2873                     bits+= esc_length;
2874                 run=0;
2875             }else
2876                 run++;
2877         }
2878         i= scantable[last];
2879
2880         level= temp[i] + 64;
2881
2882         assert(level - 64);
2883
2884         if((level&(~127)) == 0){
2885             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2886         }else
2887             bits+= esc_length;
2888
2889     }
2890
2891     if(last>=0){
2892         if(s->mb_intra)
2893             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2894         else
2895             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2896     }
2897
2898     s->dsp.idct_add(bak, stride, temp);
2899
2900     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
2901
2902     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2903 }
2904
2905 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2906     MpegEncContext * const s= (MpegEncContext *)c;
2907     const uint8_t *scantable= s->intra_scantable.permutated;
2908     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2909     DCTELEM * const temp= (DCTELEM*)aligned_temp;
2910     int i, last, run, bits, level, start_i;
2911     const int esc_length= s->ac_esc_length;
2912     uint8_t * length;
2913     uint8_t * last_length;
2914
2915     assert(h==8);
2916
2917     s->dsp.diff_pixels(temp, src1, src2, stride);
2918
2919     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2920
2921     bits=0;
2922
2923     if (s->mb_intra) {
2924         start_i = 1;
2925         length     = s->intra_ac_vlc_length;
2926         last_length= s->intra_ac_vlc_last_length;
2927         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2928     } else {
2929         start_i = 0;
2930         length     = s->inter_ac_vlc_length;
2931         last_length= s->inter_ac_vlc_last_length;
2932     }
2933
2934     if(last>=start_i){
2935         run=0;
2936         for(i=start_i; i<last; i++){
2937             int j= scantable[i];
2938             level= temp[j];
2939
2940             if(level){
2941                 level+=64;
2942                 if((level&(~127)) == 0){
2943                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2944                 }else
2945                     bits+= esc_length;
2946                 run=0;
2947             }else
2948                 run++;
2949         }
2950         i= scantable[last];
2951
2952         level= temp[i] + 64;
2953
2954         assert(level - 64);
2955
2956         if((level&(~127)) == 0){
2957             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2958         }else
2959             bits+= esc_length;
2960     }
2961
2962     return bits;
2963 }
2964
/**
 * Sum of absolute differences between vertically adjacent pixels of a block
 * 16 pixels wide (h - 1 row pairs).
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance between rows
 * @param h      number of rows in the block
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)
{
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *const below = s + stride;

        for (col = 0; col < 16; col++)
            score += ABS(s[col] - below[col]);
        s = below;
    }

    return score;
}
2979
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a block
 * 16 pixels wide (h - 1 row pairs).
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between rows, same for both blocks
 * @param h      number of rows in the block
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *const below1 = s1 + stride;
        uint8_t *const below2 = s2 + stride;

        for (col = 0; col < 16; col++)
            score += ABS(s1[col] - s2[col] - below1[col] + below2[col]);
        s1 = below1;
        s2 = below2;
    }

    return score;
}
2994
/* Square of a; the argument is evaluated twice, so pass side-effect-free
   expressions only. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      first pixel of the block
 * @param dummy  unused
 * @param stride offset (in uint8_t elements) from one row to the next
 * @param h      number of rows; h-1 pairs of neighbouring rows are compared
 * @return the accumulated sum, 0 when h <= 1
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = s + stride;

        for (col = 0; col < 16; col++) {
            total += SQ(s[col] - below[col]);
        }
        s += stride;
    }

    return total;
}
3010
/**
 * Sum of squared vertical-gradient differences between two
 * 16-pixel-wide blocks: for every pixel the difference s1-s2 is compared
 * with the same difference one row further down, and the result squared.
 *
 * @param c      unused
 * @param s1     first pixel of the first block
 * @param s2     first pixel of the second block
 * @param stride offset (in uint8_t elements) from one row to the next,
 *               applied to both blocks
 * @param h      number of rows; h-1 pairs of neighbouring rows are compared
 * @return the accumulated sum, 0 when h <= 1
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            const int d     = here - below;

            /* square written out instead of going through SQ() */
            total += d * d;
        }
    }

    return total;
}
3025
/*
 * Instantiate the *16_c compare functions.
 * NOTE(review): WARPER8_16_SQ is defined outside this section; judging by
 * its arguments it presumably defines the second-named function (the 16
 * variant) in terms of the first (the 8x8 variant) -- confirm against the
 * macro definition.
 * The names produced here are the ones dsputil_init() stores through
 * SET_CMP_FUNC() and in hadamard8_diff[4].
 */
3026 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3027 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3028 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3029 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3030 WARPER8_16_SQ(rd8x8_c, rd16_c)
3031 WARPER8_16_SQ(bit8x8_c, bit16_c)
3032
3033 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3034  converted */
3035 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3036 {
3037     j_rev_dct (block);
3038     put_pixels_clamped_c(block, dest, line_size);
3039 }
3040 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3041 {
3042     j_rev_dct (block);
3043     add_pixels_clamped_c(block, dest, line_size);
3044 }
3045
3046 /* init static data */
3047 void dsputil_static_init(void)
3048 {
3049     int i;
3050
3051     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3052     for(i=0;i<MAX_NEG_CROP;i++) {
3053         cropTbl[i] = 0;
3054         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3055     }
3056
3057     for(i=0;i<512;i++) {
3058         squareTbl[i] = (i - 256) * (i - 256);
3059     }
3060
3061     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3062 }
3063
3064
3065 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3066 {
3067     int i;
3068
3069 #ifdef CONFIG_ENCODERS
3070     if(avctx->dct_algo==FF_DCT_FASTINT) {
3071         c->fdct = fdct_ifast;
3072         c->fdct248 = fdct_ifast248;
3073     }
3074     else if(avctx->dct_algo==FF_DCT_FAAN) {
3075         c->fdct = ff_faandct;
3076         c->fdct248 = ff_faandct248;
3077     }
3078     else {
3079         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3080         c->fdct248 = ff_fdct248_islow;
3081     }
3082 #endif //CONFIG_ENCODERS
3083
3084     if(avctx->idct_algo==FF_IDCT_INT){
3085         c->idct_put= ff_jref_idct_put;
3086         c->idct_add= ff_jref_idct_add;
3087         c->idct    = j_rev_dct;
3088         c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3089     }else{ //accurate/default
3090         c->idct_put= simple_idct_put;
3091         c->idct_add= simple_idct_add;
3092         c->idct    = simple_idct;
3093         c->idct_permutation_type= FF_NO_IDCT_PERM;
3094     }
3095
3096     c->get_pixels = get_pixels_c;
3097     c->diff_pixels = diff_pixels_c;
3098     c->put_pixels_clamped = put_pixels_clamped_c;
3099     c->add_pixels_clamped = add_pixels_clamped_c;
3100     c->gmc1 = gmc1_c;
3101     c->gmc = gmc_c;
3102     c->clear_blocks = clear_blocks_c;
3103     c->pix_sum = pix_sum_c;
3104     c->pix_norm1 = pix_norm1_c;
3105
3106     /* TODO [0] 16  [1] 8 */
3107     c->pix_abs[0][0] = pix_abs16_c;
3108     c->pix_abs[0][1] = pix_abs16_x2_c;
3109     c->pix_abs[0][2] = pix_abs16_y2_c;
3110     c->pix_abs[0][3] = pix_abs16_xy2_c;
3111     c->pix_abs[1][0] = pix_abs8_c;
3112     c->pix_abs[1][1] = pix_abs8_x2_c;
3113     c->pix_abs[1][2] = pix_abs8_y2_c;
3114     c->pix_abs[1][3] = pix_abs8_xy2_c;
3115
3116 #define dspfunc(PFX, IDX, NUM) \
3117     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3118     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3119     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3120     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3121
3122     dspfunc(put, 0, 16);
3123     dspfunc(put_no_rnd, 0, 16);
3124     dspfunc(put, 1, 8);
3125     dspfunc(put_no_rnd, 1, 8);
3126     dspfunc(put, 2, 4);
3127     dspfunc(put, 3, 2);
3128
3129     dspfunc(avg, 0, 16);
3130     dspfunc(avg_no_rnd, 0, 16);
3131     dspfunc(avg, 1, 8);
3132     dspfunc(avg_no_rnd, 1, 8);
3133     dspfunc(avg, 2, 4);
3134     dspfunc(avg, 3, 2);
3135 #undef dspfunc
3136
3137     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3138     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3139     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3140     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3141     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3142     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3143     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3144     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3145     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3146
3147     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3148     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3149     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3150     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3151     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3152     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3153     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3154     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3155     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3156
3157 #define dspfunc(PFX, IDX, NUM) \
3158     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3159     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3160     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3161     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3162     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3163     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3164     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3165     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3166     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3167     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3168     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3169     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3170     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3171     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3172     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3173     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3174
3175     dspfunc(put_qpel, 0, 16);
3176     dspfunc(put_no_rnd_qpel, 0, 16);
3177
3178     dspfunc(avg_qpel, 0, 16);
3179     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3180
3181     dspfunc(put_qpel, 1, 8);
3182     dspfunc(put_no_rnd_qpel, 1, 8);
3183
3184     dspfunc(avg_qpel, 1, 8);
3185     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3186
3187     dspfunc(put_h264_qpel, 0, 16);
3188     dspfunc(put_h264_qpel, 1, 8);
3189     dspfunc(put_h264_qpel, 2, 4);
3190     dspfunc(avg_h264_qpel, 0, 16);
3191     dspfunc(avg_h264_qpel, 1, 8);
3192     dspfunc(avg_h264_qpel, 2, 4);
3193
3194 #undef dspfunc
3195     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3196     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3197     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3198     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3199     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3200     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3201
3202     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3203     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3204     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3205     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3206     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3207     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3208     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3209     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3210
3211 #define SET_CMP_FUNC(name) \
3212     c->name[0]= name ## 16_c;\
3213     c->name[1]= name ## 8x8_c;
3214
3215     SET_CMP_FUNC(hadamard8_diff)
3216     c->hadamard8_diff[4]= hadamard8_intra16_c;
3217     SET_CMP_FUNC(dct_sad)
3218     c->sad[0]= pix_abs16_c;
3219     c->sad[1]= pix_abs8_c;
3220     c->sse[0]= sse16_c;
3221     c->sse[1]= sse8_c;
3222     SET_CMP_FUNC(quant_psnr)
3223     SET_CMP_FUNC(rd)
3224     SET_CMP_FUNC(bit)
3225     c->vsad[0]= vsad16_c;
3226     c->vsad[4]= vsad_intra16_c;
3227     c->vsse[0]= vsse16_c;
3228     c->vsse[4]= vsse_intra16_c;
3229
3230     c->add_bytes= add_bytes_c;
3231     c->diff_bytes= diff_bytes_c;
3232     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3233     c->bswap_buf= bswap_buf;
3234
3235     c->h263_h_loop_filter= h263_h_loop_filter_c;
3236     c->h263_v_loop_filter= h263_v_loop_filter_c;
3237
3238 #ifdef HAVE_MMX
3239     dsputil_init_mmx(c, avctx);
3240 #endif
3241 #ifdef ARCH_ARMV4L
3242     dsputil_init_armv4l(c, avctx);
3243 #endif
3244 #ifdef HAVE_MLIB
3245     dsputil_init_mlib(c, avctx);
3246 #endif
3247 #ifdef ARCH_ALPHA
3248     dsputil_init_alpha(c, avctx);
3249 #endif
3250 #ifdef ARCH_POWERPC
3251     dsputil_init_ppc(c, avctx);
3252 #endif
3253 #ifdef HAVE_MMI
3254     dsputil_init_mmi(c, avctx);
3255 #endif
3256 #ifdef ARCH_SH4
3257     dsputil_init_sh4(c,avctx);
3258 #endif
3259
3260     switch(c->idct_permutation_type){
3261     case FF_NO_IDCT_PERM:
3262         for(i=0; i<64; i++)
3263             c->idct_permutation[i]= i;
3264         break;
3265     case FF_LIBMPEG2_IDCT_PERM:
3266         for(i=0; i<64; i++)
3267             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3268         break;
3269     case FF_SIMPLE_IDCT_PERM:
3270         for(i=0; i<64; i++)
3271             c->idct_permutation[i]= simple_mmx_permutation[i];
3272         break;
3273     case FF_TRANSPOSE_IDCT_PERM:
3274         for(i=0; i<64; i++)
3275             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3276         break;
3277     default:
3278         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3279     }
3280 }
3281