git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
/* Clamping lookup table. Code in this file reads it through
 * (cropTbl + MAX_NEG_CROP), so indices from -MAX_NEG_CROP up to
 * 255 + MAX_NEG_CROP are inside the array. It is only zero-initialised here;
 * presumably it is filled by an init routine outside this view -- TODO confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table read through (squareTbl + 256), which makes indices
 * -256..255 valid (the sse*_c functions index it with pixel differences).
 * Zero-initialised here; presumably filled with squares elsewhere -- TODO confirm. */
uint32_t squareTbl[512] = {0, };
  36
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  47
/* Zigzag scan used with the 248 IDCT. NOTE: unlike the specification,
   the two fields are interleaved here. Entry i is the raster index
   (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  60
/* Non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer.
 * Only zero-initialised here; the values are presumably computed by an init
 * routine outside this view -- TODO confirm. */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
  63
/* Alternate scan order for an 8x8 block that walks mostly along rows:
 * entry i is the raster index (row * 8 + column) of the i-th coefficient. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  74
/* Alternate scan order for an 8x8 block that walks mostly down columns:
 * entry i is the raster index (row * 8 + column) of the i-th coefficient. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  85
/* Fixed-point reciprocals: a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
 * The product needs more than 32 bits, so it must be computed in a 64-bit type.
 * For b >= 2 the entry is 2^32/b rounded up; inverse[0] is 0 and inverse[1] is
 * 2^32-1 (2^32 does not fit in 32 bits), which is why b == 1 is excluded above. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 121
/* Input permutation for the simple_idct_mmx: a 64-entry table of
 * coefficient indices (each value 0x00..0x3F appears once). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 133
 134 static int pix_sum_c(uint8_t * pix, int line_size)
 135 {
 136     int s, i, j;
 137
 138     s = 0;
 139     for (i = 0; i < 16; i++) {
 140         for (j = 0; j < 16; j += 8) {
 141             s += pix[0];
 142             s += pix[1];
 143             s += pix[2];
 144             s += pix[3];
 145             s += pix[4];
 146             s += pix[5];
 147             s += pix[6];
 148             s += pix[7];
 149             pix += 8;
 150         }
 151         pix += line_size - 16;
 152     }
 153     return s;
 154 }
 155
 156 static int pix_norm1_c(uint8_t * pix, int line_size)
 157 {
 158     int s, i, j;
 159     uint32_t *sq = squareTbl + 256;
 160
 161     s = 0;
 162     for (i = 0; i < 16; i++) {
 163         for (j = 0; j < 16; j += 8) {
 164 #if 0
 165             s += sq[pix[0]];
 166             s += sq[pix[1]];
 167             s += sq[pix[2]];
 168             s += sq[pix[3]];
 169             s += sq[pix[4]];
 170             s += sq[pix[5]];
 171             s += sq[pix[6]];
 172             s += sq[pix[7]];
 173 #else
 174 #if LONG_MAX > 2147483647
 175             register uint64_t x=*(uint64_t*)pix;
 176             s += sq[x&0xff];
 177             s += sq[(x>>8)&0xff];
 178             s += sq[(x>>16)&0xff];
 179             s += sq[(x>>24)&0xff];
 180             s += sq[(x>>32)&0xff];
 181             s += sq[(x>>40)&0xff];
 182             s += sq[(x>>48)&0xff];
 183             s += sq[(x>>56)&0xff];
 184 #else
 185             register uint32_t x=*(uint32_t*)pix;
 186             s += sq[x&0xff];
 187             s += sq[(x>>8)&0xff];
 188             s += sq[(x>>16)&0xff];
 189             s += sq[(x>>24)&0xff];
 190             x=*(uint32_t*)(pix+4);
 191             s += sq[x&0xff];
 192             s += sq[(x>>8)&0xff];
 193             s += sq[(x>>16)&0xff];
 194             s += sq[(x>>24)&0xff];
 195 #endif
 196 #endif
 197             pix += 8;
 198         }
 199         pix += line_size - 16;
 200     }
 201     return s;
 202 }
 203
 204 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 205     int i;
 206
 207     for(i=0; i+8<=w; i+=8){
 208         dst[i+0]= bswap_32(src[i+0]);
 209         dst[i+1]= bswap_32(src[i+1]);
 210         dst[i+2]= bswap_32(src[i+2]);
 211         dst[i+3]= bswap_32(src[i+3]);
 212         dst[i+4]= bswap_32(src[i+4]);
 213         dst[i+5]= bswap_32(src[i+5]);
 214         dst[i+6]= bswap_32(src[i+6]);
 215         dst[i+7]= bswap_32(src[i+7]);
 216     }
 217     for(;i<w; i++){
 218         dst[i+0]= bswap_32(src[i+0]);
 219     }
 220 }
 221
 222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 223 {
 224     int s, i;
 225     uint32_t *sq = squareTbl + 256;
 226
 227     s = 0;
 228     for (i = 0; i < h; i++) {
 229         s += sq[pix1[0] - pix2[0]];
 230         s += sq[pix1[1] - pix2[1]];
 231         s += sq[pix1[2] - pix2[2]];
 232         s += sq[pix1[3] - pix2[3]];
 233         pix1 += line_size;
 234         pix2 += line_size;
 235     }
 236     return s;
 237 }
 238
 239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 240 {
 241     int s, i;
 242     uint32_t *sq = squareTbl + 256;
 243
 244     s = 0;
 245     for (i = 0; i < h; i++) {
 246         s += sq[pix1[0] - pix2[0]];
 247         s += sq[pix1[1] - pix2[1]];
 248         s += sq[pix1[2] - pix2[2]];
 249         s += sq[pix1[3] - pix2[3]];
 250         s += sq[pix1[4] - pix2[4]];
 251         s += sq[pix1[5] - pix2[5]];
 252         s += sq[pix1[6] - pix2[6]];
 253         s += sq[pix1[7] - pix2[7]];
 254         pix1 += line_size;
 255         pix2 += line_size;
 256     }
 257     return s;
 258 }
 259
 260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 261 {
 262     int s, i;
 263     uint32_t *sq = squareTbl + 256;
 264
 265     s = 0;
 266     for (i = 0; i < h; i++) {
 267         s += sq[pix1[ 0] - pix2[ 0]];
 268         s += sq[pix1[ 1] - pix2[ 1]];
 269         s += sq[pix1[ 2] - pix2[ 2]];
 270         s += sq[pix1[ 3] - pix2[ 3]];
 271         s += sq[pix1[ 4] - pix2[ 4]];
 272         s += sq[pix1[ 5] - pix2[ 5]];
 273         s += sq[pix1[ 6] - pix2[ 6]];
 274         s += sq[pix1[ 7] - pix2[ 7]];
 275         s += sq[pix1[ 8] - pix2[ 8]];
 276         s += sq[pix1[ 9] - pix2[ 9]];
 277         s += sq[pix1[10] - pix2[10]];
 278         s += sq[pix1[11] - pix2[11]];
 279         s += sq[pix1[12] - pix2[12]];
 280         s += sq[pix1[13] - pix2[13]];
 281         s += sq[pix1[14] - pix2[14]];
 282         s += sq[pix1[15] - pix2[15]];
 283
 284         pix1 += line_size;
 285         pix2 += line_size;
 286     }
 287     return s;
 288 }
 289
 290
 291 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 292     int s, i, j;
 293     const int dec_count= w==8 ? 3 : 4;
 294     int tmp[16*16];
 295 #if 0
 296     int level, ori;
 297     static const int scale[2][2][4][4]={
 298       {
 299         {
 300             //8x8 dec=3
 301             {268, 239, 239, 213},
 302             {  0, 224, 224, 152},
 303             {  0, 135, 135, 110},
 304         },{
 305             //16x16 dec=4
 306             {344, 310, 310, 280},
 307             {  0, 320, 320, 228},
 308             {  0, 175, 175, 136},
 309             {  0, 129, 129, 102},
 310         }
 311       },{
 312         {//FIXME 5/3
 313             //8x8 dec=3
 314             {275, 245, 245, 218},
 315             {  0, 230, 230, 156},
 316             {  0, 138, 138, 113},
 317         },{
 318             //16x16 dec=4
 319             {352, 317, 317, 286},
 320             {  0, 328, 328, 233},
 321             {  0, 180, 180, 140},
 322             {  0, 132, 132, 105},
 323         }
 324       }
 325     };
 326 #endif
 327
 328     for (i = 0; i < h; i++) {
 329         for (j = 0; j < w; j+=4) {
 330             tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 331             tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 332             tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 333             tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 334         }
 335         pix1 += line_size;
 336         pix2 += line_size;
 337     }
 338     ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
 339
 340     s=0;
 341 #if 0
 342     for(level=0; level<dec_count; level++){
 343         for(ori= level ? 1 : 0; ori<4; ori++){
 344             int sx= (ori&1) ? 1<<level: 0;
 345             int stride= 16<<(dec_count-level);
 346             int sy= (ori&2) ? stride>>1 : 0;
 347             int size= 1<<level;
 348
 349             for(i=0; i<size; i++){
 350                 for(j=0; j<size; j++){
 351                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 352                     s += ABS(v);
 353                 }
 354             }
 355         }
 356     }
 357 #endif
 358     for (i = 0; i < h; i++) {
 359         for (j = 0; j < w; j+=4) {
 360             s+= ABS(tmp[16*i+j+0]);
 361             s+= ABS(tmp[16*i+j+1]);
 362             s+= ABS(tmp[16*i+j+2]);
 363             s+= ABS(tmp[16*i+j+3]);
 364         }
 365     }
 366     assert(s>=0);
 367
 368     return s>>2;
 369 }
 370
 371 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 372     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 373 }
 374
 375 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 376     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 377 }
 378
 379 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 380     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 381 }
 382
 383 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 384     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 385 }
 386
 387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 388 {
 389     int i;
 390
 391     /* read the pixels */
 392     for(i=0;i<8;i++) {
 393         block[0] = pixels[0];
 394         block[1] = pixels[1];
 395         block[2] = pixels[2];
 396         block[3] = pixels[3];
 397         block[4] = pixels[4];
 398         block[5] = pixels[5];
 399         block[6] = pixels[6];
 400         block[7] = pixels[7];
 401         pixels += line_size;
 402         block += 8;
 403     }
 404 }
 405
 406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 407                           const uint8_t *s2, int stride){
 408     int i;
 409
 410     /* read the pixels */
 411     for(i=0;i<8;i++) {
 412         block[0] = s1[0] - s2[0];
 413         block[1] = s1[1] - s2[1];
 414         block[2] = s1[2] - s2[2];
 415         block[3] = s1[3] - s2[3];
 416         block[4] = s1[4] - s2[4];
 417         block[5] = s1[5] - s2[5];
 418         block[6] = s1[6] - s2[6];
 419         block[7] = s1[7] - s2[7];
 420         s1 += stride;
 421         s2 += stride;
 422         block += 8;
 423     }
 424 }
 425
 426
 427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 428                                  int line_size)
 429 {
 430     int i;
 431     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 432
 433     /* read the pixels */
 434     for(i=0;i<8;i++) {
 435         pixels[0] = cm[block[0]];
 436         pixels[1] = cm[block[1]];
 437         pixels[2] = cm[block[2]];
 438         pixels[3] = cm[block[3]];
 439         pixels[4] = cm[block[4]];
 440         pixels[5] = cm[block[5]];
 441         pixels[6] = cm[block[6]];
 442         pixels[7] = cm[block[7]];
 443
 444         pixels += line_size;
 445         block += 8;
 446     }
 447 }
 448
 449 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 450                                  int line_size)
 451 {
 452     int i;
 453     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 454
 455     /* read the pixels */
 456     for(i=0;i<4;i++) {
 457         pixels[0] = cm[block[0]];
 458         pixels[1] = cm[block[1]];
 459         pixels[2] = cm[block[2]];
 460         pixels[3] = cm[block[3]];
 461
 462         pixels += line_size;
 463         block += 8;
 464     }
 465 }
 466
 467 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 468                                         uint8_t *restrict pixels,
 469                                         int line_size)
 470 {
 471     int i, j;
 472
 473     for (i = 0; i < 8; i++) {
 474         for (j = 0; j < 8; j++) {
 475             if (*block < -128)
 476                 *pixels = 0;
 477             else if (*block > 127)
 478                 *pixels = 255;
 479             else
 480                 *pixels = (uint8_t)(*block + 128);
 481             block++;
 482             pixels++;
 483         }
 484         pixels += (line_size - 8);
 485     }
 486 }
 487
 488 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 489                           int line_size)
 490 {
 491     int i;
 492     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 493
 494     /* read the pixels */
 495     for(i=0;i<8;i++) {
 496         pixels[0] = cm[pixels[0] + block[0]];
 497         pixels[1] = cm[pixels[1] + block[1]];
 498         pixels[2] = cm[pixels[2] + block[2]];
 499         pixels[3] = cm[pixels[3] + block[3]];
 500         pixels[4] = cm[pixels[4] + block[4]];
 501         pixels[5] = cm[pixels[5] + block[5]];
 502         pixels[6] = cm[pixels[6] + block[6]];
 503         pixels[7] = cm[pixels[7] + block[7]];
 504         pixels += line_size;
 505         block += 8;
 506     }
 507 }
 508
 509 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 510                           int line_size)
 511 {
 512     int i;
 513     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 514
 515     /* read the pixels */
 516     for(i=0;i<4;i++) {
 517         pixels[0] = cm[pixels[0] + block[0]];
 518         pixels[1] = cm[pixels[1] + block[1]];
 519         pixels[2] = cm[pixels[2] + block[2]];
 520         pixels[3] = cm[pixels[3] + block[3]];
 521         pixels += line_size;
 522         block += 8;
 523     }
 524 }
 525 #if 0
 526
/*
 * Disabled (inside "#if 0") variant of PIXOP2 that handles 8 pixels at a
 * time as one 64-bit word.
 *
 * For a given OPNAME it generates 8-pixel-wide, h-row functions that combine
 * source data with the destination through OP(dst, value):
 *   _pixels               plain copy/combine of the source row
 *   _pixels_x2_c / _y2_c  per-byte average of the source with its right /
 *                         lower neighbour, rounded up:   (a|b) - (((a^b)&0xFE..)>>1)
 *   _no_rnd_pixels_x2_c / _y2_c
 *                         the same, rounded down:        (a&b) + (((a^b)&0xFE..)>>1)
 *   _pixels_xy2_c / _no_rnd_pixels_xy2_c
 *                         average of the four neighbouring pixels; the low
 *                         2 bits and high 6 bits of every byte are summed
 *                         separately so no carry crosses a byte boundary.
 *                         The rounding constant is 2 per byte, or 1 for the
 *                         no_rnd version. Two rows are produced per loop
 *                         iteration.
 * CALL_2X_PIXELS then derives the 16-pixel-wide versions.
 *
 * NOTE(review): the first function is named OPNAME_pixels, but the
 * CALL_2X_PIXELS line refers to OPNAME_pixels_c -- this looks like a name
 * mismatch that would need fixing before this branch could be enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 665
/* 64-bit averaging OP for the disabled PIXOP2 variant: replaces a with the
 * per-byte average of a and b, rounded up. Note that a and b are each
 * evaluated more than once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 667 #else // 64 bit variant
 668
 669 #define PIXOP2(OPNAME, OP) \
 670 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 671     int i;\
 672     for(i=0; i<h; i++){\
 673         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 674         pixels+=line_size;\
 675         block +=line_size;\
 676     }\
 677 }\
 678 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 679     int i;\
 680     for(i=0; i<h; i++){\
 681         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 682         pixels+=line_size;\
 683         block +=line_size;\
 684     }\
 685 }\
 686 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 687     int i;\
 688     for(i=0; i<h; i++){\
 689         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 690         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 691         pixels+=line_size;\
 692         block +=line_size;\
 693     }\
 694 }\
 695 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 696     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 697 }\
 698 \
 699 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 700                                                 int src_stride1, int src_stride2, int h){\
 701     int i;\
 702     for(i=0; i<h; i++){\
 703         uint32_t a,b;\
 704         a= LD32(&src1[i*src_stride1  ]);\
 705         b= LD32(&src2[i*src_stride2  ]);\
 706         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 707         a= LD32(&src1[i*src_stride1+4]);\
 708         b= LD32(&src2[i*src_stride2+4]);\
 709         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 710     }\
 711 }\
 712 \
 713 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 714                                                 int src_stride1, int src_stride2, int h){\
 715     int i;\
 716     for(i=0; i<h; i++){\
 717         uint32_t a,b;\
 718         a= LD32(&src1[i*src_stride1  ]);\
 719         b= LD32(&src2[i*src_stride2  ]);\
 720         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 721         a= LD32(&src1[i*src_stride1+4]);\
 722         b= LD32(&src2[i*src_stride2+4]);\
 723         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 724     }\
 725 }\
 726 \
 727 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 728                                                 int src_stride1, int src_stride2, int h){\
 729     int i;\
 730     for(i=0; i<h; i++){\
 731         uint32_t a,b;\
 732         a= LD32(&src1[i*src_stride1  ]);\
 733         b= LD32(&src2[i*src_stride2  ]);\
 734         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 735     }\
 736 }\
 737 \
 738 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 739                                                 int src_stride1, int src_stride2, int h){\
 740     int i;\
 741     for(i=0; i<h; i++){\
 742         uint32_t a,b;\
 743         a= LD16(&src1[i*src_stride1  ]);\
 744         b= LD16(&src2[i*src_stride2  ]);\
 745         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 746     }\
 747 }\
 748 \
 749 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 750                                                 int src_stride1, int src_stride2, int h){\
 751     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 752     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 753 }\
 754 \
 755 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 756                                                 int src_stride1, int src_stride2, int h){\
 757     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 758     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 759 }\
 760 \
 761 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 762     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 763 }\
 764 \
 765 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 766     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 767 }\
 768 \
 769 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 770     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 771 }\
 772 \
 773 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 774     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 775 }\
 776 \
 777 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 778                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 779     int i;\
 780     for(i=0; i<h; i++){\
 781         uint32_t a, b, c, d, l0, l1, h0, h1;\
 782         a= LD32(&src1[i*src_stride1]);\
 783         b= LD32(&src2[i*src_stride2]);\
 784         c= LD32(&src3[i*src_stride3]);\
 785         d= LD32(&src4[i*src_stride4]);\
 786         l0=  (a&0x03030303UL)\
 787            + (b&0x03030303UL)\
 788            + 0x02020202UL;\
 789         h0= ((a&0xFCFCFCFCUL)>>2)\
 790           + ((b&0xFCFCFCFCUL)>>2);\
 791         l1=  (c&0x03030303UL)\
 792            + (d&0x03030303UL);\
 793         h1= ((c&0xFCFCFCFCUL)>>2)\
 794           + ((d&0xFCFCFCFCUL)>>2);\
 795         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 796         a= LD32(&src1[i*src_stride1+4]);\
 797         b= LD32(&src2[i*src_stride2+4]);\
 798         c= LD32(&src3[i*src_stride3+4]);\
 799         d= LD32(&src4[i*src_stride4+4]);\
 800         l0=  (a&0x03030303UL)\
 801            + (b&0x03030303UL)\
 802            + 0x02020202UL;\
 803         h0= ((a&0xFCFCFCFCUL)>>2)\
 804           + ((b&0xFCFCFCFCUL)>>2);\
 805         l1=  (c&0x03030303UL)\
 806            + (d&0x03030303UL);\
 807         h1= ((c&0xFCFCFCFCUL)>>2)\
 808           + ((d&0xFCFCFCFCUL)>>2);\
 809         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 810     }\
 811 }\
 812 \
 813 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 814     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 815 }\
 816 \
 817 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 818     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 819 }\
 820 \
 821 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 822     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 823 }\
 824 \
 825 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 826     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 827 }\
 828 \
 829 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 830                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 831     int i;\
 832     for(i=0; i<h; i++){\
 833         uint32_t a, b, c, d, l0, l1, h0, h1;\
 834         a= LD32(&src1[i*src_stride1]);\
 835         b= LD32(&src2[i*src_stride2]);\
 836         c= LD32(&src3[i*src_stride3]);\
 837         d= LD32(&src4[i*src_stride4]);\
 838         l0=  (a&0x03030303UL)\
 839            + (b&0x03030303UL)\
 840            + 0x01010101UL;\
 841         h0= ((a&0xFCFCFCFCUL)>>2)\
 842           + ((b&0xFCFCFCFCUL)>>2);\
 843         l1=  (c&0x03030303UL)\
 844            + (d&0x03030303UL);\
 845         h1= ((c&0xFCFCFCFCUL)>>2)\
 846           + ((d&0xFCFCFCFCUL)>>2);\
 847         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 848         a= LD32(&src1[i*src_stride1+4]);\
 849         b= LD32(&src2[i*src_stride2+4]);\
 850         c= LD32(&src3[i*src_stride3+4]);\
 851         d= LD32(&src4[i*src_stride4+4]);\
 852         l0=  (a&0x03030303UL)\
 853            + (b&0x03030303UL)\
 854            + 0x01010101UL;\
 855         h0= ((a&0xFCFCFCFCUL)>>2)\
 856           + ((b&0xFCFCFCFCUL)>>2);\
 857         l1=  (c&0x03030303UL)\
 858            + (d&0x03030303UL);\
 859         h1= ((c&0xFCFCFCFCUL)>>2)\
 860           + ((d&0xFCFCFCFCUL)>>2);\
 861         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 862     }\
 863 }\
 864 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 865                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 866     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 867     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 868 }\
 869 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 870                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 871     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 872     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 873 }\
 874 \
 875 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 876 {\
 877         int i, a0, b0, a1, b1;\
 878         a0= pixels[0];\
 879         b0= pixels[1] + 2;\
 880         a0 += b0;\
 881         b0 += pixels[2];\
 882 \
 883         pixels+=line_size;\
 884         for(i=0; i<h; i+=2){\
 885             a1= pixels[0];\
 886             b1= pixels[1];\
 887             a1 += b1;\
 888             b1 += pixels[2];\
 889 \
 890             block[0]= (a1+a0)>>2; /* FIXME non put */\
 891             block[1]= (b1+b0)>>2;\
 892 \
 893             pixels+=line_size;\
 894             block +=line_size;\
 895 \
 896             a0= pixels[0];\
 897             b0= pixels[1] + 2;\
 898             a0 += b0;\
 899             b0 += pixels[2];\
 900 \
 901             block[0]= (a1+a0)>>2;\
 902             block[1]= (b1+b0)>>2;\
 903             pixels+=line_size;\
 904             block +=line_size;\
 905         }\
 906 }\
 907 \
 908 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 909 {\
 910         int i;\
 911         const uint32_t a= LD32(pixels  );\
 912         const uint32_t b= LD32(pixels+1);\
 913         uint32_t l0=  (a&0x03030303UL)\
 914                     + (b&0x03030303UL)\
 915                     + 0x02020202UL;\
 916         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 917                    + ((b&0xFCFCFCFCUL)>>2);\
 918         uint32_t l1,h1;\
 919 \
 920         pixels+=line_size;\
 921         for(i=0; i<h; i+=2){\
 922             uint32_t a= LD32(pixels  );\
 923             uint32_t b= LD32(pixels+1);\
 924             l1=  (a&0x03030303UL)\
 925                + (b&0x03030303UL);\
 926             h1= ((a&0xFCFCFCFCUL)>>2)\
 927               + ((b&0xFCFCFCFCUL)>>2);\
 928             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 929             pixels+=line_size;\
 930             block +=line_size;\
 931             a= LD32(pixels  );\
 932             b= LD32(pixels+1);\
 933             l0=  (a&0x03030303UL)\
 934                + (b&0x03030303UL)\
 935                + 0x02020202UL;\
 936             h0= ((a&0xFCFCFCFCUL)>>2)\
 937               + ((b&0xFCFCFCFCUL)>>2);\
 938             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 939             pixels+=line_size;\
 940             block +=line_size;\
 941         }\
 942 }\
 943 \
 944 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 945 {\
 946     int j;\
 947     for(j=0; j<2; j++){\
 948         int i;\
 949         const uint32_t a= LD32(pixels  );\
 950         const uint32_t b= LD32(pixels+1);\
 951         uint32_t l0=  (a&0x03030303UL)\
 952                     + (b&0x03030303UL)\
 953                     + 0x02020202UL;\
 954         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 955                    + ((b&0xFCFCFCFCUL)>>2);\
 956         uint32_t l1,h1;\
 957 \
 958         pixels+=line_size;\
 959         for(i=0; i<h; i+=2){\
 960             uint32_t a= LD32(pixels  );\
 961             uint32_t b= LD32(pixels+1);\
 962             l1=  (a&0x03030303UL)\
 963                + (b&0x03030303UL);\
 964             h1= ((a&0xFCFCFCFCUL)>>2)\
 965               + ((b&0xFCFCFCFCUL)>>2);\
 966             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 967             pixels+=line_size;\
 968             block +=line_size;\
 969             a= LD32(pixels  );\
 970             b= LD32(pixels+1);\
 971             l0=  (a&0x03030303UL)\
 972                + (b&0x03030303UL)\
 973                + 0x02020202UL;\
 974             h0= ((a&0xFCFCFCFCUL)>>2)\
 975               + ((b&0xFCFCFCFCUL)>>2);\
 976             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 977             pixels+=line_size;\
 978             block +=line_size;\
 979         }\
 980         pixels+=4-line_size*(h+1);\
 981         block +=4-line_size*h;\
 982     }\
 983 }\
 984 \
 985 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 986 {\
 987     int j;\
 988     for(j=0; j<2; j++){\
 989         int i;\
 990         const uint32_t a= LD32(pixels  );\
 991         const uint32_t b= LD32(pixels+1);\
 992         uint32_t l0=  (a&0x03030303UL)\
 993                     + (b&0x03030303UL)\
 994                     + 0x01010101UL;\
 995         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 996                    + ((b&0xFCFCFCFCUL)>>2);\
 997         uint32_t l1,h1;\
 998 \
 999         pixels+=line_size;\
1000         for(i=0; i<h; i+=2){\
1001             uint32_t a= LD32(pixels  );\
1002             uint32_t b= LD32(pixels+1);\
1003             l1=  (a&0x03030303UL)\
1004                + (b&0x03030303UL);\
1005             h1= ((a&0xFCFCFCFCUL)>>2)\
1006               + ((b&0xFCFCFCFCUL)>>2);\
1007             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008             pixels+=line_size;\
1009             block +=line_size;\
1010             a= LD32(pixels  );\
1011             b= LD32(pixels+1);\
1012             l0=  (a&0x03030303UL)\
1013                + (b&0x03030303UL)\
1014                + 0x01010101UL;\
1015             h0= ((a&0xFCFCFCFCUL)>>2)\
1016               + ((b&0xFCFCFCFCUL)>>2);\
1017             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018             pixels+=line_size;\
1019             block +=line_size;\
1020         }\
1021         pixels+=4-line_size*(h+1);\
1022         block +=4-line_size*h;\
1023     }\
1024 }\
1025 \
1026 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1027 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1028 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1029 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1030 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1031 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1032 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1033 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1034
1035 #define op_avg(a, b) a = rnd_avg32(a, b)
1036 #endif
1037 #define op_put(a, b) a = b
1038
1039 PIXOP2(avg, op_avg)
1040 PIXOP2(put, op_put)
1041 #undef op_avg
1042 #undef op_put
1043
/*
 * Rounded averages of two and of four values.
 * Every argument is parenthesized so that an expression argument built from
 * operators binding more loosely than '+' (shifts, bitwise ops, ?:) is
 * evaluated as a unit before being summed.
 * Arguments are evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1046
/*
 * Single-stride front end for put_no_rnd_pixels16_l2(): the one `stride`
 * argument is forwarded for all three stride parameters of the callee.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1050
/*
 * Single-stride front end for put_no_rnd_pixels8_l2(): the one `stride`
 * argument is forwarded for all three stride parameters of the callee.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1054
/*
 * Bilinear blend of an 8-sample-wide block with a fixed sub-sample offset.
 *
 * x16 and y16 select the offset in sixteenths; the four neighbour weights
 * (16-x16)*(16-y16), x16*(16-y16), (16-x16)*y16 and x16*y16 always sum to
 * 256, which is why the weighted sum is shifted right by 8 after `rounder`
 * has been added.
 *
 * Each of the h output rows reads 9 samples from the current source row and
 * 9 from the row below it.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl= (16-x16)*(16-y16);
    const int w_tr= (   x16)*(16-y16);
    const int w_bl= (16-x16)*(   y16);
    const int w_br= (   x16)*(   y16);
    int row, col;

    for(row=0; row<h; row++){
        const uint8_t *top= src;
        const uint8_t *bot= src + stride;

        for(col=0; col<8; col++){
            dst[col]= (  w_tl*top[col] + w_tr*top[col+1]
                       + w_bl*bot[col] + w_br*bot[col+1]
                       + rounder)>>8;
        }
        dst+= stride;
        src+= stride;
    }
}
1077
/*
 * Warped prediction of an 8-sample-wide, h-row block.
 *
 * (ox, oy) is the source coordinate accumulator for the first sample of the
 * first row. It advances by (dxx, dyx) per output column and by (dxy, dyy)
 * per output row. An accumulator is first shifted right by 16; of the
 * result, the low `shift` bits are the fractional position and the remaining
 * bits the integer sample position.
 *
 * Inside the picture the four surrounding samples are blended bilinearly,
 * `r` is added and the sum is shifted right by 2*shift. When the integer
 * position falls outside [0, width-1) horizontally or [0, height-1)
 * vertically, it is clamped with clip() on that axis and the interpolation
 * along that axis is dropped; when both are outside, the clamped sample is
 * copied unchanged.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one  = 1<<shift;
    const int mask = one-1;
    const int max_x= width-1;
    const int max_y= height-1;
    int row;

    for(row=0; row<h; row++){
        int px= ox;
        int py= oy;
        int col;

        for(col=0; col<8; col++){
            const int sx= px>>16;
            const int sy= py>>16;
            const int fx= sx&mask;
            const int fy= sy&mask;
            const int ix= sx>>shift;
            const int iy= sy>>shift;
            /* one unsigned compare rejects both negative and too-large positions */
            const int x_inside= (unsigned)ix < (unsigned)max_x;
            const int y_inside= (unsigned)iy < (unsigned)max_y;
            uint8_t *out= &dst[row*stride + col];
            int pos;

            if(x_inside && y_inside){
                pos= ix + iy*stride;
                *out= (  (  src[pos         ]*(one-fx)
                          + src[pos       +1]*     fx )*(one-fy)
                       + (  src[pos+stride  ]*(one-fx)
                          + src[pos+stride+1]*     fx )*     fy
                       + r)>>(shift*2);
            }else if(x_inside){
                /* vertical position out of range: horizontal blend only */
                pos= ix + clip(iy, 0, max_y)*stride;
                *out= ( (  src[pos  ]*(one-fx)
                         + src[pos+1]*     fx )*one
                       + r)>>(shift*2);
            }else if(y_inside){
                /* horizontal position out of range: vertical blend only */
                pos= clip(ix, 0, max_x) + iy*stride;
                *out= ( (  src[pos       ]*(one-fy)
                         + src[pos+stride]*     fy )*one
                       + r)>>(shift*2);
            }else{
                pos= clip(ix, 0, max_x) + clip(iy, 0, max_y)*stride;
                *out= src[pos];
            }

            px+= dxx;
            py+= dyx;
        }
        ox+= dxy;
        oy+= dyy;
    }
}
1135
/*
 * Offset (0,0) case: no interpolation, just hand the block to the plain
 * put_pixelsN_c() routine matching `width`.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        put_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        put_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        put_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        put_pixels16_c(dst, src, stride, height);
    }
}
1144
/*
 * Fills a width x height block with (683*(2*s + right + 1)) >> 11, where s
 * is the co-located source sample and `right` its right-hand neighbour.
 * 683/2048 is close to 1/3, so the sample is weighted 2:1 against `right`.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1155
/*
 * Fills a width x height block with (683*(s + 2*right + 1)) >> 11, where s
 * is the co-located source sample and `right` its right-hand neighbour.
 * 683/2048 is close to 1/3, so `right` is weighted 2:1 against the sample.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1166
/*
 * Fills a width x height block with (683*(2*s + below + 1)) >> 11, where s
 * is the co-located source sample and `below` the sample one row down.
 * 683/2048 is close to 1/3, so the sample is weighted 2:1 against `below`.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + below[col] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1177
/*
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * position with weights 4 (sample), 3 (right), 3 (below), 2 (below-right).
 * The weights sum to 12; 6 is added and the total is scaled by 2731 >> 15
 * (2731/32768 is close to 1/12).
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1188
/*
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * position with weights 3 (sample), 2 (right), 4 (below), 3 (below-right).
 * The weights sum to 12; 6 is added and the total is scaled by 2731 >> 15
 * (2731/32768 is close to 1/12).
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1199
/*
 * Fills a width x height block with (683*(s + 2*below + 1)) >> 11, where s
 * is the co-located source sample and `below` the sample one row down.
 * 683/2048 is close to 1/3, so `below` is weighted 2:1 against the sample.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*below[col] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1210
/*
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * position with weights 3 (sample), 4 (right), 2 (below), 3 (below-right).
 * The weights sum to 12; 6 is added and the total is scaled by 2731 >> 15
 * (2731/32768 is close to 1/12).
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1221
/*
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * position with weights 2 (sample), 3 (right), 3 (below), 4 (below-right).
 * The weights sum to 12; 6 is added and the total is scaled by 2731 >> 15
 * (2731/32768 is close to 1/12).
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1232
/*
 * Offset (0,0) case of the averaging family: no interpolation, just hand the
 * block to the plain avg_pixelsN_c() routine matching `width`.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        avg_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        avg_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        avg_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        avg_pixels16_c(dst, src, stride, height);
    }
}
1241
/*
 * Computes (683*(2*s + right + 1)) >> 11 for every position of a
 * width x height block (s = co-located source sample, right = its right-hand
 * neighbour) and replaces the destination sample with the rounded-up average
 * of its old value and that result.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1252
/*
 * Computes (683*(s + 2*right + 1)) >> 11 for every position of a
 * width x height block (s = co-located source sample, right = its right-hand
 * neighbour) and replaces the destination sample with the rounded-up average
 * of its old value and that result.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1263
/*
 * Computes (683*(2*s + below + 1)) >> 11 for every position of a
 * width x height block (s = co-located source sample, below = the sample one
 * row down) and replaces the destination sample with the rounded-up average
 * of its old value and that result.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1274
/*
 * Blends the 2x2 source neighbourhood of each position with weights
 * 4 (sample), 3 (right), 3 (below), 2 (below-right), adds 6 and scales by
 * 2731 >> 15 (close to 1/12); the destination sample then becomes the
 * rounded-up average of its old value and that result.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1285
/*
 * Blends the 2x2 source neighbourhood of each position with weights
 * 3 (sample), 2 (right), 4 (below), 3 (below-right), adds 6 and scales by
 * 2731 >> 15 (close to 1/12); the destination sample then becomes the
 * rounded-up average of its old value and that result.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1296
/*
 * Computes (683*(s + 2*below + 1)) >> 11 for every position of a
 * width x height block (s = co-located source sample, below = the sample one
 * row down) and replaces the destination sample with the rounded-up average
 * of its old value and that result.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1307
/*
 * Blends the 2x2 source neighbourhood of each position with weights
 * 3 (sample), 4 (right), 2 (below), 3 (below-right), adds 6 and scales by
 * 2731 >> 15 (close to 1/12); the destination sample then becomes the
 * rounded-up average of its old value and that result.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1318
/*
 * Blends the 2x2 source neighbourhood of each position with weights
 * 2 (sample), 3 (right), 3 (below), 4 (below-right), adds 6 and scales by
 * 2731 >> 15 (close to 1/12); the destination sample then becomes the
 * rounded-up average of its old value and that result.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/*
 * Disabled: TPEL_WIDTH(width) would generate fixed-width entry points
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) that forward to
 * the generic put_tpel_pixels_mcXY_c() helpers with `width` filled in.
 * Each body is a plain call; a leading `void` would turn the statement into
 * a malformed declaration and stop the macro compiling once enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1350
/*
 * H264_CHROMA_MC(OPNAME, OP) generates OPNAME##h264_chroma_mc{2,4,8}_c():
 * bilinear interpolation of a 2-, 4- or 8-sample-wide block of h rows.
 *
 * x and y (each asserted to lie in 0..7) select the offset in eighths. The
 * four neighbour weights (8-x)*(8-y), x*(8-y), (8-x)*y and x*y sum to 64;
 * the unscaled weighted sum is handed to OP together with the destination
 * sample, and OP is responsible for rounding and scaling.
 *
 * Every output row reads one sample more than the block width from the
 * current source row and from the row below it.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++)\
    {\
        for(col=0; col<2; col++)\
        {\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++)\
    {\
        for(col=0; col<4; col++)\
        {\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++)\
    {\
        for(col=0; col<8; col++)\
        {\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/*
 * op_put stores the weighted sum scaled back by 64 with rounding (+32, >>6);
 * op_avg additionally takes the rounded-up mean of that value and the sample
 * already present in the destination.
 */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1421
/*
 * Copies h rows of 4 bytes from src to dst using one LD32/ST32 pair per row;
 * the two buffers advance by their own strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        ST32(dst, LD32(src));
    }
}
1432
/*
 * Copies h rows of 8 bytes from src to dst as two LD32/ST32 pairs per row;
 * the two buffers advance by their own strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        for(off=0; off<8; off+=4){
            ST32(dst+off, LD32(src+off));
        }
    }
}
1444
/*
 * Copies h rows of 16 bytes from src to dst as four LD32/ST32 pairs per row;
 * the two buffers advance by their own strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        for(off=0; off<16; off+=4){
            ST32(dst+off, LD32(src+off));
        }
    }
}
1458
/*
 * Copies h rows of 17 bytes from src to dst: four LD32/ST32 pairs cover the
 * first 16 bytes of a row and the 17th byte is copied on its own.
 * The two buffers advance by their own strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        for(off=0; off<16; off+=4){
            ST32(dst+off, LD32(src+off));
        }
        dst[16]= src[16];
    }
}
1473
/*
 * Copies h rows of 9 bytes from src to dst: two LD32/ST32 pairs cover the
 * first 8 bytes of a row and the 9th byte is copied on its own.
 * The two buffers advance by their own strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        for(off=0; off<8; off+=4){
            ST32(dst+off, LD32(src+off));
        }
        dst[8]= src[8];
    }
}
1486
1487
1488 #define QPEL_MC(r, OPNAME, RND, OP) \
1489 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1490     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1491     int i;\
1492     for(i=0; i<h; i++)\
1493     {\
1494         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1495         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1496         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1497         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1498         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1499         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1500         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1501         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1502         dst+=dstStride;\
1503         src+=srcStride;\
1504     }\
1505 }\
1506 \
1507 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1508     const int w=8;\
1509     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1510     int i;\
1511     for(i=0; i<w; i++)\
1512     {\
1513         const int src0= src[0*srcStride];\
1514         const int src1= src[1*srcStride];\
1515         const int src2= src[2*srcStride];\
1516         const int src3= src[3*srcStride];\
1517         const int src4= src[4*srcStride];\
1518         const int src5= src[5*srcStride];\
1519         const int src6= src[6*srcStride];\
1520         const int src7= src[7*srcStride];\
1521         const int src8= src[8*srcStride];\
1522         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1523         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1524         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1525         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1526         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1527         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1528         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1529         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1530         dst++;\
1531         src++;\
1532     }\
1533 }\
1534 \
1535 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1536     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1537     int i;\
1538     \
1539     for(i=0; i<h; i++)\
1540     {\
1541         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1542         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1543         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1544         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1545         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1546         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1547         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1548         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1549         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1550         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1551         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1552         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1553         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1554         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1555         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1556         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1557         dst+=dstStride;\
1558         src+=srcStride;\
1559     }\
1560 }\
1561 \
1562 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1563     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1564     int i;\
1565     const int w=16;\
1566     for(i=0; i<w; i++)\
1567     {\
1568         const int src0= src[0*srcStride];\
1569         const int src1= src[1*srcStride];\
1570         const int src2= src[2*srcStride];\
1571         const int src3= src[3*srcStride];\
1572         const int src4= src[4*srcStride];\
1573         const int src5= src[5*srcStride];\
1574         const int src6= src[6*srcStride];\
1575         const int src7= src[7*srcStride];\
1576         const int src8= src[8*srcStride];\
1577         const int src9= src[9*srcStride];\
1578         const int src10= src[10*srcStride];\
1579         const int src11= src[11*srcStride];\
1580         const int src12= src[12*srcStride];\
1581         const int src13= src[13*srcStride];\
1582         const int src14= src[14*srcStride];\
1583         const int src15= src[15*srcStride];\
1584         const int src16= src[16*srcStride];\
1585         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1586         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1587         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1588         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1589         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1590         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1591         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1592         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1593         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1594         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1595         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1596         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1597         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1598         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1599         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1600         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1601         dst++;\
1602         src++;\
1603     }\
1604 }\
1605 \
1606 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1607     OPNAME ## pixels8_c(dst, src, stride, 8);\
1608 }\
1609 \
1610 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1611     uint8_t half[64];\
1612     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1613     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1614 }\
1615 \
1616 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1617     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1618 }\
1619 \
1620 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1621     uint8_t half[64];\
1622     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1623     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1624 }\
1625 \
1626 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1627     uint8_t full[16*9];\
1628     uint8_t half[64];\
1629     copy_block9(full, src, 16, stride, 9);\
1630     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1631     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1632 }\
1633 \
1634 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1635     uint8_t full[16*9];\
1636     copy_block9(full, src, 16, stride, 9);\
1637     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1638 }\
1639 \
1640 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1641     uint8_t full[16*9];\
1642     uint8_t half[64];\
1643     copy_block9(full, src, 16, stride, 9);\
1644     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1645     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1646 }\
1647 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1648     uint8_t full[16*9];\
1649     uint8_t halfH[72];\
1650     uint8_t halfV[64];\
1651     uint8_t halfHV[64];\
1652     copy_block9(full, src, 16, stride, 9);\
1653     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1654     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1655     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1656     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1657 }\
1658 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1659     uint8_t full[16*9];\
1660     uint8_t halfH[72];\
1661     uint8_t halfHV[64];\
1662     copy_block9(full, src, 16, stride, 9);\
1663     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1664     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1665     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1666     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1667 }\
1668 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1669     uint8_t full[16*9];\
1670     uint8_t halfH[72];\
1671     uint8_t halfV[64];\
1672     uint8_t halfHV[64];\
1673     copy_block9(full, src, 16, stride, 9);\
1674     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1675     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1676     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1677     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1678 }\
1679 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1680     uint8_t full[16*9];\
1681     uint8_t halfH[72];\
1682     uint8_t halfHV[64];\
1683     copy_block9(full, src, 16, stride, 9);\
1684     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1685     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1686     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1687     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1688 }\
1689 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1690     uint8_t full[16*9];\
1691     uint8_t halfH[72];\
1692     uint8_t halfV[64];\
1693     uint8_t halfHV[64];\
1694     copy_block9(full, src, 16, stride, 9);\
1695     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1696     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1697     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1698     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1699 }\
1700 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1701     uint8_t full[16*9];\
1702     uint8_t halfH[72];\
1703     uint8_t halfHV[64];\
1704     copy_block9(full, src, 16, stride, 9);\
1705     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1706     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1707     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1709 }\
1710 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1711     uint8_t full[16*9];\
1712     uint8_t halfH[72];\
1713     uint8_t halfV[64];\
1714     uint8_t halfHV[64];\
1715     copy_block9(full, src, 16, stride, 9);\
1716     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1717     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1718     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1719     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1720 }\
1721 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1722     uint8_t full[16*9];\
1723     uint8_t halfH[72];\
1724     uint8_t halfHV[64];\
1725     copy_block9(full, src, 16, stride, 9);\
1726     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1727     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1728     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1729     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1730 }\
1731 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1732     uint8_t halfH[72];\
1733     uint8_t halfHV[64];\
1734     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1735     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1737 }\
1738 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1739     uint8_t halfH[72];\
1740     uint8_t halfHV[64];\
1741     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1742     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1743     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1744 }\
1745 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1746     uint8_t full[16*9];\
1747     uint8_t halfH[72];\
1748     uint8_t halfV[64];\
1749     uint8_t halfHV[64];\
1750     copy_block9(full, src, 16, stride, 9);\
1751     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1752     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1753     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1754     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1755 }\
1756 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1757     uint8_t full[16*9];\
1758     uint8_t halfH[72];\
1759     copy_block9(full, src, 16, stride, 9);\
1760     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1762     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1763 }\
1764 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1765     uint8_t full[16*9];\
1766     uint8_t halfH[72];\
1767     uint8_t halfV[64];\
1768     uint8_t halfHV[64];\
1769     copy_block9(full, src, 16, stride, 9);\
1770     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1771     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1772     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1774 }\
1775 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1776     uint8_t full[16*9];\
1777     uint8_t halfH[72];\
1778     copy_block9(full, src, 16, stride, 9);\
1779     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1781     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1782 }\
1783 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1784     uint8_t halfH[72];\
1785     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1786     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1787 }\
1788 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1789     OPNAME ## pixels16_c(dst, src, stride, 16);\
1790 }\
1791 \
1792 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1793     uint8_t half[256];\
1794     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1795     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1796 }\
1797 \
1798 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1799     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1800 }\
1801 \
1802 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1803     uint8_t half[256];\
1804     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1805     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1806 }\
1807 \
1808 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1809     uint8_t full[24*17];\
1810     uint8_t half[256];\
1811     copy_block17(full, src, 24, stride, 17);\
1812     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1813     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1814 }\
1815 \
1816 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1817     uint8_t full[24*17];\
1818     copy_block17(full, src, 24, stride, 17);\
1819     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1820 }\
1821 \
1822 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1823     uint8_t full[24*17];\
1824     uint8_t half[256];\
1825     copy_block17(full, src, 24, stride, 17);\
1826     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1827     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1828 }\
1829 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1830     uint8_t full[24*17];\
1831     uint8_t halfH[272];\
1832     uint8_t halfV[256];\
1833     uint8_t halfHV[256];\
1834     copy_block17(full, src, 24, stride, 17);\
1835     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1836     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1837     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1838     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1839 }\
1840 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1841     uint8_t full[24*17];\
1842     uint8_t halfH[272];\
1843     uint8_t halfHV[256];\
1844     copy_block17(full, src, 24, stride, 17);\
1845     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1846     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1847     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1848     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1849 }\
1850 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1851     uint8_t full[24*17];\
1852     uint8_t halfH[272];\
1853     uint8_t halfV[256];\
1854     uint8_t halfHV[256];\
1855     copy_block17(full, src, 24, stride, 17);\
1856     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1857     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1858     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1859     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1860 }\
1861 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1862     uint8_t full[24*17];\
1863     uint8_t halfH[272];\
1864     uint8_t halfHV[256];\
1865     copy_block17(full, src, 24, stride, 17);\
1866     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1867     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1868     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1869     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1870 }\
1871 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1872     uint8_t full[24*17];\
1873     uint8_t halfH[272];\
1874     uint8_t halfV[256];\
1875     uint8_t halfHV[256];\
1876     copy_block17(full, src, 24, stride, 17);\
1877     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1878     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1879     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1880     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1881 }\
1882 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1883     uint8_t full[24*17];\
1884     uint8_t halfH[272];\
1885     uint8_t halfHV[256];\
1886     copy_block17(full, src, 24, stride, 17);\
1887     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1888     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1889     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1891 }\
1892 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t full[24*17];\
1894     uint8_t halfH[272];\
1895     uint8_t halfV[256];\
1896     uint8_t halfHV[256];\
1897     copy_block17(full, src, 24, stride, 17);\
1898     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1899     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1900     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1901     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1902 }\
1903 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1904     uint8_t full[24*17];\
1905     uint8_t halfH[272];\
1906     uint8_t halfHV[256];\
1907     copy_block17(full, src, 24, stride, 17);\
1908     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1909     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1910     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1911     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1912 }\
1913 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1914     uint8_t halfH[272];\
1915     uint8_t halfHV[256];\
1916     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1917     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1919 }\
1920 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1921     uint8_t halfH[272];\
1922     uint8_t halfHV[256];\
1923     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1924     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1925     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1926 }\
1927 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1928     uint8_t full[24*17];\
1929     uint8_t halfH[272];\
1930     uint8_t halfV[256];\
1931     uint8_t halfHV[256];\
1932     copy_block17(full, src, 24, stride, 17);\
1933     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1934     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1935     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1936     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1937 }\
1938 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1939     uint8_t full[24*17];\
1940     uint8_t halfH[272];\
1941     copy_block17(full, src, 24, stride, 17);\
1942     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1944     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1945 }\
1946 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1947     uint8_t full[24*17];\
1948     uint8_t halfH[272];\
1949     uint8_t halfV[256];\
1950     uint8_t halfHV[256];\
1951     copy_block17(full, src, 24, stride, 17);\
1952     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1953     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1954     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1956 }\
1957 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1958     uint8_t full[24*17];\
1959     uint8_t halfH[272];\
1960     copy_block17(full, src, 24, stride, 17);\
1961     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1963     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1964 }\
1965 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1966     uint8_t halfH[272];\
1967     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1968     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1969 }
1970
/*
 * Store operators handed to QPEL_MC as its last argument.
 *   a - destination pixel lvalue (read as well as written by the avg forms)
 *   b - raw, unnormalised filter sum produced by the mpeg4_qpel*_lowpass code
 * The tap weights used by that code add up to 32 (2*(20-6+3-1)), so ">>5"
 * brings the sum back to pixel scale: "+16" rounds to nearest, "+15" makes
 * exact halves round down instead of up.
 * cm is the local "cropTbl + MAX_NEG_CROP" pointer declared inside every
 * generated function; indexing it presumably clamps the value to the pixel
 * range (table contents are filled in elsewhere - TODO confirm).
 */
/* average the filtered value with what is already in a, rounding up */
1971 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* same without either rounding term; no instantiation visible below uses it */
1972 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
/* overwrite a with the rounded filtered value */
1973 #define op_put(a, b) a = cm[((b) + 16)>>5]
/* overwrite a with the filtered value, halves rounded down */
1974 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1975
/*
 * Instantiate the qpel function families.  Argument 2 is the OPNAME prefix of
 * the generated functions, argument 3 is the RND infix pasted into the names
 * of the put helpers they call (put ## RND ## mpeg4_...), argument 4 is the
 * store operator.  NOTE(review): the meaning of the leading 0/1 argument is
 * set by the macro header, which is not part of this excerpt - confirm there.
 */
1976 QPEL_MC(0, put_       , _       , op_put)
1977 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1978 QPEL_MC(0, avg_       , _       , op_avg)
1979 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* the operator names are reused by later code, so drop them here */
1980 #undef op_avg
1981 #undef op_avg_no_rnd
1982 #undef op_put
1983 #undef op_put_no_rnd
1984
1985 #if 1
/**
 * Generates the H.264 quarter-pel low-pass filter functions for one store mode.
 *
 * Every output sample is the 6-tap sum with weights (1, -5, 20, 20, -5, 1)
 * (total weight 32) taken around the half-sample position between two
 * neighbouring input samples.  The sum is NOT normalised here; it is passed
 * raw to the store operator.
 *
 * @param OPNAME prefix pasted onto every generated function name
 * @param OP     store operator OP(dst_lvalue, sum) for the single-pass
 *               (h and v) filters
 * @param OP2    store operator for the two-pass (hv) filter, whose sum has
 *               gone through the 6-tap filter twice
 *
 * Generated functions, for SIZE in {4, 8, 16}:
 *   OPNAME##h264_qpel<SIZE>_h_lowpass(dst, src, dstStride, srcStride)
 *       reads src[-2] .. src[SIZE+2] on each of SIZE rows
 *   OPNAME##h264_qpel<SIZE>_v_lowpass(dst, src, dstStride, srcStride)
 *       reads rows -2 .. SIZE+2 of each of SIZE columns
 *   OPNAME##h264_qpel<SIZE>_hv_lowpass(dst, tmp, src, dstStride, tmpStride, srcStride)
 *       horizontal pass into the int16_t scratch buffer tmp (SIZE+5 rows for
 *       the 4 and 8 wide versions), then vertical pass over tmp; reads the
 *       union of the two windows above
 * The caller must make those margins around src readable.
 *
 * The 16 wide versions are built from four calls of the 8 wide ones.
 *
 * cm is declared in every leaf function but never named in the macro text;
 * it is presumably there for the OP / OP2 expansions, as the op_* macros
 * earlier in this file index cm[] (the H.264 operators are defined outside
 * this excerpt - TODO confirm).
 */
1986 #define H264_LOWPASS(OPNAME, OP, OP2) \
1987 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1988     const int h=4; /* number of rows processed */\
1989     uint8_t *cm = cropTbl + MAX_NEG_CROP; /* presumably consumed by the OP expansion */\
1990     int i;\
1991     for(i=0; i<h; i++)\
1992     {\
1993         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* needs 2 samples left of src */\
1994         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1995         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1996         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6])); /* needs 3 samples right of the block */\
1997         dst+=dstStride; /* next output row */\
1998         src+=srcStride; /* next input row */\
1999     }\
2000 }\
2001 /* 4 wide vertical filter: one column per iteration, all inputs loaded before any store */\
2002 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2003     const int w=4; /* number of columns processed */\
2004     uint8_t *cm = cropTbl + MAX_NEG_CROP; /* presumably consumed by the OP expansion */\
2005     int i;\
2006     for(i=0; i<w; i++)\
2007     {\
2008         const int srcB= src[-2*srcStride]; /* two rows above the block */\
2009         const int srcA= src[-1*srcStride]; /* one row above the block */\
2010         const int src0= src[0 *srcStride];\
2011         const int src1= src[1 *srcStride];\
2012         const int src2= src[2 *srcStride];\
2013         const int src3= src[3 *srcStride];\
2014         const int src4= src[4 *srcStride]; /* rows 4..6 lie below the 4 output rows */\
2015         const int src5= src[5 *srcStride];\
2016         const int src6= src[6 *srcStride];\
2017         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2018         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2019         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2020         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2021         dst++; /* next output column */\
2022         src++; /* next input column */\
2023     }\
2024 }\
2025 /* 4 wide two-pass filter: horizontal into tmp, then vertical over tmp */\
2026 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2027     const int h=4;\
2028     const int w=4;\
2029     uint8_t *cm = cropTbl + MAX_NEG_CROP; /* presumably consumed by the OP2 expansion */\
2030     int i;\
2031     src -= 2*srcStride; /* start two rows above: the vertical pass needs rows -2..h+2 */\
2032     for(i=0; i<h+5; i++)\
2033     {\
2034         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]); /* raw sum, stored without normalisation */\
2035         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2036         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2037         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2038         tmp+=tmpStride;\
2039         src+=srcStride;\
2040     }\
2041     tmp -= tmpStride*(h+5-2); /* rewind to the third row written, i.e. the one matching output row 0 */\
2042     for(i=0; i<w; i++)\
2043     {\
2044         const int tmpB= tmp[-2*tmpStride]; /* first row written by the horizontal pass */\
2045         const int tmpA= tmp[-1*tmpStride];\
2046         const int tmp0= tmp[0 *tmpStride];\
2047         const int tmp1= tmp[1 *tmpStride];\
2048         const int tmp2= tmp[2 *tmpStride];\
2049         const int tmp3= tmp[3 *tmpStride];\
2050         const int tmp4= tmp[4 *tmpStride];\
2051         const int tmp5= tmp[5 *tmpStride];\
2052         const int tmp6= tmp[6 *tmpStride]; /* last row written by the horizontal pass */\
2053         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* sum has been filtered twice */\
2054         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2055         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2056         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2057         dst++;\
2058         tmp++;\
2059     }\
2060 }\
2061 /* 8 wide horizontal filter: same scheme as the 4 wide one, 8 outputs per row, 8 rows */\
2062 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2063     const int h=8; /* number of rows processed */\
2064     uint8_t *cm = cropTbl + MAX_NEG_CROP; /* presumably consumed by the OP expansion */\
2065     int i;\
2066     for(i=0; i<h; i++)\
2067     {\
2068         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2069         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2070         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2071         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2072         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2073         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2074         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2075         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10])); /* rightmost read is src[10] */\
2076         dst+=dstStride;\
2077         src+=srcStride;\
2078     }\
2079 }\
2080 /* 8 wide vertical filter: one column per iteration, rows -2..10 loaded before any store */\
2081 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2082     const int w=8; /* number of columns processed */\
2083     uint8_t *cm = cropTbl + MAX_NEG_CROP; /* presumably consumed by the OP expansion */\
2084     int i;\
2085     for(i=0; i<w; i++)\
2086     {\
2087         const int srcB= src[-2*srcStride]; /* two rows above the block */\
2088         const int srcA= src[-1*srcStride]; /* one row above the block */\
2089         const int src0= src[0 *srcStride];\
2090         const int src1= src[1 *srcStride];\
2091         const int src2= src[2 *srcStride];\
2092         const int src3= src[3 *srcStride];\
2093         const int src4= src[4 *srcStride];\
2094         const int src5= src[5 *srcStride];\
2095         const int src6= src[6 *srcStride];\
2096         const int src7= src[7 *srcStride];\
2097         const int src8= src[8 *srcStride]; /* rows 8..10 lie below the 8 output rows */\
2098         const int src9= src[9 *srcStride];\
2099         const int src10=src[10*srcStride];\
2100         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2101         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2102         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2103         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2104         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2105         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2106         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2107         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2108         dst++;\
2109         src++;\
2110     }\
2111 }\
2112 /* 8 wide two-pass filter: 13 rows filtered horizontally into tmp, then vertical pass over tmp */\
2113 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2114     const int h=8;\
2115     const int w=8;\
2116     uint8_t *cm = cropTbl + MAX_NEG_CROP; /* presumably consumed by the OP2 expansion */\
2117     int i;\
2118     src -= 2*srcStride; /* start two rows above: the vertical pass needs rows -2..h+2 */\
2119     for(i=0; i<h+5; i++)\
2120     {\
2121         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]); /* raw sum, stored without normalisation */\
2122         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2123         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2124         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2125         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2126         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2127         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2128         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2129         tmp+=tmpStride;\
2130         src+=srcStride;\
2131     }\
2132     tmp -= tmpStride*(h+5-2); /* rewind to the third row written, i.e. the one matching output row 0 */\
2133     for(i=0; i<w; i++)\
2134     {\
2135         const int tmpB= tmp[-2*tmpStride]; /* first row written by the horizontal pass */\
2136         const int tmpA= tmp[-1*tmpStride];\
2137         const int tmp0= tmp[0 *tmpStride];\
2138         const int tmp1= tmp[1 *tmpStride];\
2139         const int tmp2= tmp[2 *tmpStride];\
2140         const int tmp3= tmp[3 *tmpStride];\
2141         const int tmp4= tmp[4 *tmpStride];\
2142         const int tmp5= tmp[5 *tmpStride];\
2143         const int tmp6= tmp[6 *tmpStride];\
2144         const int tmp7= tmp[7 *tmpStride];\
2145         const int tmp8= tmp[8 *tmpStride];\
2146         const int tmp9= tmp[9 *tmpStride];\
2147         const int tmp10=tmp[10*tmpStride]; /* last row written by the horizontal pass */\
2148         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* sum has been filtered twice */\
2149         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2150         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2151         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2152         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2153         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2154         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2155         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2156         dst++;\
2157         tmp++;\
2158     }\
2159 }\
2160 /* 16 wide vertical filter assembled from four 8x8 quadrants */\
2161 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2162     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride); /* top-left */\
2163     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride); /* top-right */\
2164     src += 8*srcStride; /* move both pointers down to the lower half */\
2165     dst += 8*dstStride;\
2166     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride); /* bottom-left */\
2167     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride); /* bottom-right */\
2168 }\
2169 /* 16 wide horizontal filter assembled from four 8x8 quadrants */\
2170 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2171     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride); /* top-left */\
2172     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride); /* top-right */\
2173     src += 8*srcStride; /* move both pointers down to the lower half */\
2174     dst += 8*dstStride;\
2175     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride); /* bottom-left */\
2176     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride); /* bottom-right */\
2177 }\
2178 /* 16 wide two-pass filter assembled from four 8x8 quadrants sharing one scratch buffer */\
2179 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2180     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride); /* top-left */\
2181     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride); /* top-right */\
2182     src += 8*srcStride; /* tmp is deliberately not advanced */\
2183     dst += 8*dstStride;\
2184     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride); /* bottom-left, overwrites the same tmp rows */\
2185     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride); /* bottom-right */\
2186 }\
2187
/**
 * Expands to the sixteen functions OPNAME##h264_qpel##SIZE##_mcXY_c
 * (X, Y in 0..3) for one block size.
 *
 * Every variant builds its intermediate planes with the put_ lowpass
 * filters into SIZE-wide scratch buffers; only the final store into dst
 * goes through the OPNAME-selected primitive. Variants that filter
 * vertically first copy SIZE+5 source rows, starting two rows above src,
 * into a packed buffer whose row 2 corresponds to src.
 *
 * Block comments only inside the body: a line comment would swallow the
 * continuation lines after splicing.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* no filtering at all */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* combine the unfiltered block with the horizontally filtered one */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* horizontal lowpass straight into dst */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* same as mc10 but paired with the block one pixel to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    /* vertical lowpass straight into dst */\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, center, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    /* same as mc01 but paired with the block one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, center+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* vertical input is taken one pixel to the right */\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    /* horizontal input is taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    /* horizontal input one row down, vertical input one pixel right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
\
    /* two-dimensional lowpass straight into dst */\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    /* horizontal input is taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const center= padded + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    /* vertical input is taken one pixel to the right */\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}
2324
/*
 * Store operators handed to H264_LOWPASS. Each one scales the filter sum b
 * down with rounding and looks the result up in cm[], which must be in
 * scope wherever the macro is expanded (presumably the clipping table set
 * up inside the generated lowpass functions -- their start is not visible
 * here).
 *   op_*  : (b + 16)  >> 5
 *   op2_* : (b + 512) >> 10, used through OP2 by the vertical stage of the
 *           hv filter
 * The _put forms overwrite a; the _avg forms store the rounded-up mean of
 * the old a and the new value.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* instantiate the put_/avg_ lowpass filters, then the mcXY functions for 4, 8 and 16 wide blocks */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* the operators are only meant for the expansions above */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2344 #endif
2345
2346 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2347     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2348     int i;
2349
2350     for(i=0; i<h; i++){
2351         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2352         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2353         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2354         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2355         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2356         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2357         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2358         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2359         dst+=dstStride;
2360         src+=srcStride;
2361     }
2362 }
2363
2364 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2365     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2366     int i;
2367
2368     for(i=0; i<w; i++){
2369         const int src_1= src[ -srcStride];
2370         const int src0 = src[0          ];
2371         const int src1 = src[  srcStride];
2372         const int src2 = src[2*srcStride];
2373         const int src3 = src[3*srcStride];
2374         const int src4 = src[4*srcStride];
2375         const int src5 = src[5*srcStride];
2376         const int src6 = src[6*srcStride];
2377         const int src7 = src[7*srcStride];
2378         const int src8 = src[8*srcStride];
2379         const int src9 = src[9*srcStride];
2380         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2381         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2382         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2383         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2384         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2385         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2386         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2387         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2388         src++;
2389         dst++;
2390     }
2391 }
2392
/** 8x8 mspel case 00: no filtering, forwards to put_pixels8_c() for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2396
/**
 * 8x8 mspel case 10: horizontally filters src into a packed 8x8 buffer and
 * hands src together with that buffer to put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2402
/** 8x8 mspel case 20: horizontal lowpass of 8 rows written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2406
/**
 * 8x8 mspel case 30: like case 10, but the horizontally filtered block is
 * paired with the source shifted one pixel to the right.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2412
/** 8x8 mspel case 02: vertical lowpass of 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2416
/**
 * 8x8 mspel case 12.
 * Eleven rows starting one row above src are filtered horizontally into a
 * packed buffer; running the vertical filter on that buffer from its second
 * row gives the 2-D filtered block. It is combined by put_pixels8_l2() with
 * the vertically filtered source.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[8*11];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * 8x8 mspel case 32: identical to case 12 except that the vertically
 * filtered block is computed from the source one pixel to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[8*11];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * 8x8 mspel case 22: horizontal filter over eleven rows (starting one row
 * above src) into a packed buffer, then vertical filter from the buffer's
 * second row straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[8*11];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowsH+8, stride, 8, 8);
}
2440
2441 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2442     int x;
2443     const int strength= ff_h263_loop_filter_strength[qscale];
2444
2445     for(x=0; x<8; x++){
2446         int d1, d2, ad1;
2447         int p0= src[x-2*stride];
2448         int p1= src[x-1*stride];
2449         int p2= src[x+0*stride];
2450         int p3= src[x+1*stride];
2451         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2452
2453         if     (d<-2*strength) d1= 0;
2454         else if(d<-  strength) d1=-2*strength - d;
2455         else if(d<   strength) d1= d;
2456         else if(d< 2*strength) d1= 2*strength - d;
2457         else                   d1= 0;
2458
2459         p1 += d1;
2460         p2 -= d1;
2461         if(p1&256) p1= ~(p1>>31);
2462         if(p2&256) p2= ~(p2>>31);
2463
2464         src[x-1*stride] = p1;
2465         src[x+0*stride] = p2;
2466
2467         ad1= ABS(d1)>>1;
2468
2469         d2= clip((p0-p3)/4, -ad1, ad1);
2470
2471         src[x-2*stride] = p0 - d2;
2472         src[x+  stride] = p3 + d2;
2473     }
2474 }
2475
2476 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2477     int y;
2478     const int strength= ff_h263_loop_filter_strength[qscale];
2479
2480     for(y=0; y<8; y++){
2481         int d1, d2, ad1;
2482         int p0= src[y*stride-2];
2483         int p1= src[y*stride-1];
2484         int p2= src[y*stride+0];
2485         int p3= src[y*stride+1];
2486         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2487
2488         if     (d<-2*strength) d1= 0;
2489         else if(d<-  strength) d1=-2*strength - d;
2490         else if(d<   strength) d1= d;
2491         else if(d< 2*strength) d1= 2*strength - d;
2492         else                   d1= 0;
2493
2494         p1 += d1;
2495         p2 -= d1;
2496         if(p1&256) p1= ~(p1>>31);
2497         if(p2&256) p2= ~(p2>>31);
2498
2499         src[y*stride-1] = p1;
2500         src[y*stride+0] = p2;
2501
2502         ad1= ABS(d1)>>1;
2503
2504         d2= clip((p0-p3)/4, -ad1, ad1);
2505
2506         src[y*stride-2] = p0 - d2;
2507         src[y*stride+1] = p3 + d2;
2508     }
2509 }
2510
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 * The vertical pass goes into an int buffer: rows 0 and 7 are only scaled
 * by 4, rows 1..6 get the 3-tap sum. The horizontal pass then writes back:
 * columns 0 and 7 just undo the vertical scale (round, >>2), columns 1..6
 * apply the second 3-tap sum (round, >>4).
 * @param src    top-left pixel of the block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    for(col=0; col<8; col++){
        vert[0][col]= 4*src[col           ];
        vert[7][col]= 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *cur= src + row*stride;

        for(col=0; col<8; col++)
            vert[row][col]= cur[col - stride] + 2*cur[col] + cur[col + stride];
    }

    for(row=0; row<8; row++){
        uint8_t *out= src + row*stride;
        const int *in= vert[row];

        out[0]= (in[0] + 2)>>2;
        out[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2537
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2565
/**
 * 16-wide sum of absolute differences where every pix2 sample is first
 * merged with its right-hand neighbour through avg2(); each row therefore
 * reads pix2[0] .. pix2[16].
 * @param v         unused
 * @param pix1      block compared as is
 * @param pix2      block sampled between horizontal neighbours
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2593
/**
 * 16-wide sum of absolute differences where every pix2 sample is first
 * merged with the sample directly below it through avg2(); h+1 rows of
 * pix2 are read.
 * @param v         unused
 * @param pix1      block compared as is
 * @param pix2      block sampled between vertical neighbours
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2623
/**
 * 16-wide sum of absolute differences where every pix2 sample is first
 * merged with its right, lower and lower-right neighbours through avg4();
 * h+1 rows of 17 pix2 pixels are read.
 * @param v         unused
 * @param pix1      block compared as is
 * @param pix2      block sampled between four neighbours
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2653
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2673
/**
 * 8-wide sum of absolute differences where every pix2 sample is first
 * merged with its right-hand neighbour through avg2(); each row therefore
 * reads pix2[0] .. pix2[8].
 * @param v         unused
 * @param pix1      block compared as is
 * @param pix2      block sampled between horizontal neighbours
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2693
/**
 * 8-wide sum of absolute differences where every pix2 sample is first
 * merged with the sample directly below it through avg2(); h+1 rows of
 * pix2 are read.
 * @param v         unused
 * @param pix1      block compared as is
 * @param pix2      block sampled between vertical neighbours
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2715
/**
 * 8-wide sum of absolute differences where every pix2 sample is first
 * merged with its right, lower and lower-right neighbours through avg4();
 * h+1 rows of 9 pix2 pixels are read.
 * @param v         unused
 * @param pix1      block compared as is
 * @param pix2      block sampled between four neighbours
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2737
2738 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2739     int score1=0;
2740     int score2=0;
2741     int x,y;
2742
2743     for(y=0; y<h; y++){
2744         for(x=0; x<16; x++){
2745             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2746         }
2747         if(y+1<h){
2748             for(x=0; x<15; x++){
2749                 score2+= ABS(  s1[x  ] - s1[x  +stride]
2750                              - s1[x+1] + s1[x+1+stride])
2751                         -ABS(  s2[x  ] - s2[x  +stride]
2752                              - s2[x+1] + s2[x+1+stride]);
2753             }
2754         }
2755         s1+= stride;
2756         s2+= stride;
2757     }
2758
2759     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2760     else  return score1 + ABS(score2)*8;
2761 }
2762
2763 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2764     int score1=0;
2765     int score2=0;
2766     int x,y;
2767
2768     for(y=0; y<h; y++){
2769         for(x=0; x<8; x++){
2770             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2771         }
2772         if(y+1<h){
2773             for(x=0; x<7; x++){
2774                 score2+= ABS(  s1[x  ] - s1[x  +stride]
2775                              - s1[x+1] + s1[x+1+stride])
2776                         -ABS(  s2[x  ] - s2[x  +stride]
2777                              - s2[x+1] + s2[x+1+stride]);
2778             }
2779         }
2780         s1+= stride;
2781         s2+= stride;
2782     }
2783
2784     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2785     else  return score1 + ABS(score2)*8;
2786 }
2787
2788 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2789     int i;
2790     unsigned int sum=0;
2791
2792     for(i=0; i<8*8; i++){
2793         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2794         int w= weight[i];
2795         b>>= RECON_SHIFT;
2796         assert(-512<b && b<512);
2797
2798         sum += (w*b)*(w*b)>>4;
2799     }
2800     return sum>>2;
2801 }
2802
2803 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2804     int i;
2805
2806     for(i=0; i<8*8; i++){
2807         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2808     }
2809 }
2810
2811 /**
2812  * permutes an 8x8 block.
2813  * @param block the block which will be permuted according to the given permutation vector
2814  * @param permutation the permutation vector
2815  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2816  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2817  *                  (inverse) permutated to scantable order!
2818  */
2819 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2820 {
2821     int i;
2822     DCTELEM temp[64];
2823
2824     if(last<=0) return;
2825     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2826
2827     for(i=0; i<=last; i++){
2828         const int j= scantable[i];
2829         temp[j]= block[j];
2830         block[j]=0;
2831     }
2832
2833     for(i=0; i<=last; i++){
2834         const int j= scantable[i];
2835         const int perm_j= permutation[j];
2836         block[perm_j]= temp[j];
2837     }
2838 }
2839
/** Comparison function that ignores every argument and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
2843
2844 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2845     int i;
2846
2847     memset(cmp, 0, sizeof(void*)*5);
2848
2849     for(i=0; i<5; i++){
2850         switch(type&0xFF){
2851         case FF_CMP_SAD:
2852             cmp[i]= c->sad[i];
2853             break;
2854         case FF_CMP_SATD:
2855             cmp[i]= c->hadamard8_diff[i];
2856             break;
2857         case FF_CMP_SSE:
2858             cmp[i]= c->sse[i];
2859             break;
2860         case FF_CMP_DCT:
2861             cmp[i]= c->dct_sad[i];
2862             break;
2863         case FF_CMP_PSNR:
2864             cmp[i]= c->quant_psnr[i];
2865             break;
2866         case FF_CMP_BIT:
2867             cmp[i]= c->bit[i];
2868             break;
2869         case FF_CMP_RD:
2870             cmp[i]= c->rd[i];
2871             break;
2872         case FF_CMP_VSAD:
2873             cmp[i]= c->vsad[i];
2874             break;
2875         case FF_CMP_VSSE:
2876             cmp[i]= c->vsse[i];
2877             break;
2878         case FF_CMP_ZERO:
2879             cmp[i]= zero_cmp;
2880             break;
2881         case FF_CMP_NSSE:
2882             cmp[i]= c->nsse[i];
2883             break;
2884         case FF_CMP_W53:
2885             cmp[i]= c->w53[i];
2886             break;
2887         case FF_CMP_W97:
2888             cmp[i]= c->w97[i];
2889             break;
2890         default:
2891             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2892         }
2893     }
2894 }
2895
2896 /**
2897  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2898  */
2899 static void clear_blocks_c(DCTELEM *blocks)
2900 {
2901     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2902 }
2903
/**
 * Adds src to dst element by element; sums wrap around modulo 256.
 * @param dst accumulator, updated in place
 * @param src values to add
 * @param w   number of bytes
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] += src[n];
}
2919
/**
 * Stores src1 - src2 element by element; differences wrap around modulo 256.
 * @param dst  output
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] = src1[n] - src2[n];
}
2935
/**
 * Writes the residual of src2 against a median predictor.
 * For each position the prediction is mid_pred() of the previous src2
 * sample, the src1 sample at the same position, and their gradient
 * (previous + src1[i] - previous src1 sample) masked to 8 bits.
 * @param dst      residual output, w bytes
 * @param src1     samples used as the "top" input of the predictor
 * @param src2     samples being predicted
 * @param w        number of samples
 * @param left     in: sample preceding src2[0]; out: last src2 sample
 * @param left_top in: sample preceding src1[0]; out: last src1 sample
 *                 (both are narrowed to 8 bits on entry)
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int n;

    for(n=0; n<w; n++){
        const int top = src1[n];
        const int grad= (prev + top - prev_top)&0xFF;
        const int pred= mid_pred(prev, top, grad);

        prev_top= top;
        prev    = src2[n];
        dst[n]  = prev - pred;
    }

    *left    = prev;
    *left_top= prev_top;
}
2953
/*
 * Butterfly helpers for the Hadamard transforms.
 *
 * BUTTERFLY2(o1,o2,i1,i2): o1 = i1+i2, o2 = i1-i2. The inputs are evaluated
 * twice. The body is a braced block so that both assignments stay together
 * when the macro is the sole statement of an if/for; a trailing ';' at the
 * call site remains harmless.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
{\
    o1= (i1)+(i2);\
    o2= (i1)-(i2);\
}

/*
 * BUTTERFLY1(x,y): in-place version, x = x+y and y = x-y computed from the
 * old values. The temporaries are called a and b, so arguments must not
 * refer to variables with those names.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* BUTTERFLYA(x,y): |x+y| + |x-y|, the final butterfly stage folded into the sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2968
/**
 * Sum of the absolute 8x8 Hadamard-transformed differences src - dst.
 * Pass 1 applies three butterfly stages along every row of the difference
 * block; pass 2 applies two stages down every column and folds the third
 * one into the accumulation through BUTTERFLYA.
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block being measured
 * @param stride distance between rows, shared by both blocks
 * @param h      must be 8
 * @return the accumulated magnitude of all 64 transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* local names avoid a and b: BUTTERFLY1 declares temporaries with those names */
    for(n=0; n<8; n++){
        int *const row= coef + 8*n;
        const uint8_t *const cur= src + stride*n;
        const uint8_t *const ref= dst + stride*n;

        BUTTERFLY2(row[0], row[1], cur[0]-ref[0],cur[1]-ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-ref[2],cur[3]-ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-ref[4],cur[5]-ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-ref[6],cur[7]-ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    for(n=0; n<8; n++){
        int *const col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
#if 0
static int maxi=0;
if(total>maxi){
    maxi=total;
    printf("MAX:%d\n", maxi);
}
#endif
    return total;
}
3020
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard style) transform of
 * the pixels of src themselves, with the |temp[0] + temp[32]| term ("mean")
 * subtracted at the end.
 *
 * @param s      unused
 * @param src    8x8 block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows of src
 * @param h      block height, must be 8
 * @return the accumulated absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int row, col, k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every row of pixels */
    for(row=0; row<8; row++){
        int * const t= temp + 8*row;
        const uint8_t * const px= src + stride*row;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], px[k], px[k+1]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(t[k  ], t[k+2]);
            BUTTERFLY1(t[k+1], t[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(t[k], t[k+4]);
        }
    }

    /* vertical pass: two in-place stages, the third one is folded into the
       absolute sum by BUTTERFLYA */
    for(col=0; col<8; col++){
        int * const t= temp + col; /* t[8*r] is the element in row r */

        for(k=0; k<8; k+=2){
            BUTTERFLY1(t[8*k], t[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(t[8*k    ], t[8*(k+2)]);
            BUTTERFLY1(t[8*(k+1)], t[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(t[8*k], t[8*(k+4)]);
        }
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3068
3069 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3070     MpegEncContext * const s= (MpegEncContext *)c;
3071     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3072     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3073     int sum=0, i;
3074
3075     assert(h==8);
3076
3077     s->dsp.diff_pixels(temp, src1, src2, stride);
3078     s->dsp.fdct(temp);
3079
3080     for(i=0; i<64; i++)
3081         sum+= ABS(temp[i]);
3082
3083     return sum;
3084 }
3085
3086 void simple_idct(DCTELEM *block); //FIXME
3087
3088 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3089     MpegEncContext * const s= (MpegEncContext *)c;
3090     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3091     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3092     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3093     int sum=0, i;
3094
3095     assert(h==8);
3096     s->mb_intra=0;
3097
3098     s->dsp.diff_pixels(temp, src1, src2, stride);
3099
3100     memcpy(bak, temp, 64*sizeof(DCTELEM));
3101
3102     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3103     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3104     simple_idct(temp); //FIXME
3105
3106     for(i=0; i<64; i++)
3107         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3108
3109     return sum;
3110 }
3111
3112 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3113     MpegEncContext * const s= (MpegEncContext *)c;
3114     const uint8_t *scantable= s->intra_scantable.permutated;
3115     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3116     uint64_t __align8 aligned_bak[stride];
3117     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3118     uint8_t * const bak= (uint8_t*)aligned_bak;
3119     int i, last, run, bits, level, distoration, start_i;
3120     const int esc_length= s->ac_esc_length;
3121     uint8_t * length;
3122     uint8_t * last_length;
3123
3124     assert(h==8);
3125
3126     for(i=0; i<8; i++){
3127         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3128         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3129     }
3130
3131     s->dsp.diff_pixels(temp, src1, src2, stride);
3132
3133     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3134
3135     bits=0;
3136
3137     if (s->mb_intra) {
3138         start_i = 1;
3139         length     = s->intra_ac_vlc_length;
3140         last_length= s->intra_ac_vlc_last_length;
3141         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3142     } else {
3143         start_i = 0;
3144         length     = s->inter_ac_vlc_length;
3145         last_length= s->inter_ac_vlc_last_length;
3146     }
3147
3148     if(last>=start_i){
3149         run=0;
3150         for(i=start_i; i<last; i++){
3151             int j= scantable[i];
3152             level= temp[j];
3153
3154             if(level){
3155                 level+=64;
3156                 if((level&(~127)) == 0){
3157                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3158                 }else
3159                     bits+= esc_length;
3160                 run=0;
3161             }else
3162                 run++;
3163         }
3164         i= scantable[last];
3165
3166         level= temp[i] + 64;
3167
3168         assert(level - 64);
3169
3170         if((level&(~127)) == 0){
3171             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3172         }else
3173             bits+= esc_length;
3174
3175     }
3176
3177     if(last>=0){
3178         if(s->mb_intra)
3179             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3180         else
3181             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3182     }
3183
3184     s->dsp.idct_add(bak, stride, temp);
3185
3186     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3187
3188     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3189 }
3190
3191 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3192     MpegEncContext * const s= (MpegEncContext *)c;
3193     const uint8_t *scantable= s->intra_scantable.permutated;
3194     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3195     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3196     int i, last, run, bits, level, start_i;
3197     const int esc_length= s->ac_esc_length;
3198     uint8_t * length;
3199     uint8_t * last_length;
3200
3201     assert(h==8);
3202
3203     s->dsp.diff_pixels(temp, src1, src2, stride);
3204
3205     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3206
3207     bits=0;
3208
3209     if (s->mb_intra) {
3210         start_i = 1;
3211         length     = s->intra_ac_vlc_length;
3212         last_length= s->intra_ac_vlc_last_length;
3213         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3214     } else {
3215         start_i = 0;
3216         length     = s->inter_ac_vlc_length;
3217         last_length= s->inter_ac_vlc_last_length;
3218     }
3219
3220     if(last>=start_i){
3221         run=0;
3222         for(i=start_i; i<last; i++){
3223             int j= scantable[i];
3224             level= temp[j];
3225
3226             if(level){
3227                 level+=64;
3228                 if((level&(~127)) == 0){
3229                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3230                 }else
3231                     bits+= esc_length;
3232                 run=0;
3233             }else
3234                 run++;
3235         }
3236         i= scantable[last];
3237
3238         level= temp[i] + 64;
3239
3240         assert(level - 64);
3241
3242         if((level&(~127)) == 0){
3243             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3244         }else
3245             bits+= esc_length;
3246     }
3247
3248     return bits;
3249 }
3250
/**
 * Vertical SAD of a 16 pixel wide block: sum of absolute differences between
 * every pixel and the pixel directly below it.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated absolute differences (0 when h <= 1)
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper= s;
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const lower= upper + stride;

        for(col=0; col<16; col++){
            total+= ABS(upper[col] - lower[col]);
        }
        upper= lower;
    }

    return total;
}
3265
/**
 * Vertical SAD of the difference s1 - s2 over a 16 pixel wide block: sum of
 * absolute differences between the residual at every pixel and the residual
 * directly below it.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated absolute differences (0 when h <= 1)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++, s1+=stride, s2+=stride){
        for(col=0; col<16; col++){
            const int here = s1[col]        - s2[col];
            const int below= s1[col+stride] - s2[col+stride];

            total+= ABS(here - below);
        }
    }

    return total;
}
3280
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE of a 16 pixel wide block: sum of squared differences between
 * every pixel and the pixel directly below it.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared differences (0 when h <= 1)
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int acc=0;
    int row, col;

    for(row=1; row<h; row++, s+=stride){
        for(col=0; col<16; col++){
            const int delta= s[col] - s[col+stride];

            acc+= SQ(delta);
        }
    }

    return acc;
}

/**
 * Vertical SSE of the difference s1 - s2 over a 16 pixel wide block: sum of
 * squared differences between the residual at every pixel and the residual
 * directly below it.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared differences (0 when h <= 1)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int acc=0;
    int row, col;

    for(row=1; row<h; row++, s1+=stride, s2+=stride){
        for(col=0; col<16; col++){
            const int here = s1[col]        - s2[col];
            const int below= s1[col+stride] - s2[col+stride];

            acc+= SQ(here - below);
        }
    }

    return acc;
}
3311
/* Instantiate the 16 pixel wide variants (second argument) of the 8x8 compare
   functions (first argument). The generated *16_c names are the ones
   dsputil_init() installs through SET_CMP_FUNC and for hadamard8_diff[4].
   NOTE(review): WARPER8_16_SQ is not defined in this part of the file;
   presumably it calls the 8x8 function on each 8x8 part of the larger block
   and sums the results -- confirm against the macro definition. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3318
3319 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3320  converted */
3321 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3322 {
3323     j_rev_dct (block);
3324     put_pixels_clamped_c(block, dest, line_size);
3325 }
3326 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3327 {
3328     j_rev_dct (block);
3329     add_pixels_clamped_c(block, dest, line_size);
3330 }
3331
3332 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3333 {
3334     j_rev_dct4 (block);
3335     put_pixels_clamped4_c(block, dest, line_size);
3336 }
3337 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3338 {
3339     j_rev_dct4 (block);
3340     add_pixels_clamped4_c(block, dest, line_size);
3341 }
3342
3343 /* init static data */
3344 void dsputil_static_init(void)
3345 {
3346     int i;
3347
3348     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3349     for(i=0;i<MAX_NEG_CROP;i++) {
3350         cropTbl[i] = 0;
3351         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3352     }
3353
3354     for(i=0;i<512;i++) {
3355         squareTbl[i] = (i - 256) * (i - 256);
3356     }
3357
3358     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3359 }
3360
3361
/**
 * Fill a DSPContext with the C implementations from this file, call the
 * architecture-specific initializers that are compiled in, and build the
 * IDCT coefficient permutation table.
 *
 * @param c     context whose function pointers and idct_permutation[] are set
 * @param avctx codec context; dct_algo, lowres and idct_algo are read here to
 *              select the (I)DCT, and it is forwarded to the
 *              architecture-specific initializers and to av_log()
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* forward DCT selection, only compiled when encoders are enabled */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres==1 uses the j_rev_dct4 based functions,
       otherwise idct_algo chooses between j_rev_dct and simple_idct; each
       choice also fixes the coefficient permutation type used below */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= simple_idct_put;
            c->idct_add= simple_idct_add;
            c->idct    = simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct = vp3_idct_c;

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* dspfunc(PFX, IDX, NUM) fills row IDX of PFX_pixels_tab with the plain,
       _x2, _y2 and _xy2 variants of PFX_pixels<NUM> */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: only indices 0-2, 4-6 and 8-10 are assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* redefined dspfunc: fills the 16 entries of row IDX of PFX_pixels_tab
       with PFX<NUM>_mcXY_c, entry index = X + 4*Y */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) installs name16_c in slot 0 and name8x8_c in slot 1
       of the compare function array c->name[] */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* Architecture-specific initializers, each compiled in only when its
       macro is defined; they run after all C defaults above are in place.
       NOTE(review): presumably they replace entries of c (possibly including
       idct_permutation_type) with optimized versions -- confirm in the
       respective files. */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
   dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build idct_permutation[] from the permutation type that is in effect
       after the initializers above have run */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* keeps bits 3-5, moves bits 1-2 down by one and bit 0 up to bit 2 */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        /* copied from the simple_mmx_permutation table */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swaps the low and high 3 bits of the index */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        /* unknown type: idct_permutation[] is left untouched, only logged */
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3608