git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33 #include "snow.h"
  34
  35 /* snow.c */
  36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  37
  38 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  39 uint32_t squareTbl[512] = {0, };
  40
  41 const uint8_t ff_zigzag_direct[64] = {
  42     0,   1,  8, 16,  9,  2,  3, 10,
  43     17, 24, 32, 25, 18, 11,  4,  5,
  44     12, 19, 26, 33, 40, 48, 41, 34,
  45     27, 20, 13,  6,  7, 14, 21, 28,
  46     35, 42, 49, 56, 57, 50, 43, 36,
  47     29, 22, 15, 23, 30, 37, 44, 51,
  48     58, 59, 52, 45, 38, 31, 39, 46,
  49     53, 60, 61, 54, 47, 55, 62, 63
  50 };
  51
  52 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  53    specification, we interleave the fields */
  54 const uint8_t ff_zigzag248_direct[64] = {
  55      0,  8,  1,  9, 16, 24,  2, 10,
  56     17, 25, 32, 40, 48, 56, 33, 41,
  57     18, 26,  3, 11,  4, 12, 19, 27,
  58     34, 42, 49, 57, 50, 58, 35, 43,
  59     20, 28,  5, 13,  6, 14, 21, 29,
  60     36, 44, 51, 59, 52, 60, 37, 45,
  61     22, 30,  7, 15, 23, 31, 38, 46,
  62     53, 61, 54, 62, 39, 47, 55, 63,
  63 };
  64
  65 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  66 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  67
  68 const uint8_t ff_alternate_horizontal_scan[64] = {
  69     0,  1,   2,  3,  8,  9, 16, 17,
  70     10, 11,  4,  5,  6,  7, 15, 14,
  71     13, 12, 19, 18, 24, 25, 32, 33,
  72     26, 27, 20, 21, 22, 23, 28, 29,
  73     30, 31, 34, 35, 40, 41, 48, 49,
  74     42, 43, 36, 37, 38, 39, 44, 45,
  75     46, 47, 50, 51, 56, 57, 58, 59,
  76     52, 53, 54, 55, 60, 61, 62, 63,
  77 };
  78
  79 const uint8_t ff_alternate_vertical_scan[64] = {
  80     0,  8,  16, 24,  1,  9,  2, 10,
  81     17, 25, 32, 40, 48, 56, 57, 49,
  82     41, 33, 26, 18,  3, 11,  4, 12,
  83     19, 27, 34, 42, 50, 58, 35, 43,
  84     51, 59, 20, 28,  5, 13,  6, 14,
  85     21, 29, 36, 44, 52, 60, 37, 45,
  86     53, 61, 22, 30,  7, 15, 23, 31,
  87     38, 46, 54, 62, 39, 47, 55, 63,
  88 };
  89
  90 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  91 const uint32_t inverse[256]={
  92          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  93  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  94  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  95  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  96  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  97  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  98   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  99   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 100   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 101   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 102   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 103   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 104   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 105   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 106   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 107   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 108   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 109   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 110   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 111   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 112   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 113   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 114   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 115   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 116   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 117   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 118   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 119   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 120   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 121   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 122   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 123   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 124 };
 125
 126 /* Input permutation for the simple_idct_mmx */
 127 static const uint8_t simple_mmx_permutation[64]={
 128         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 129         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 130         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 131         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 132         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 133         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 134         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 135         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 136 };
 137
 138 static int pix_sum_c(uint8_t * pix, int line_size)
 139 {
 140     int s, i, j;
 141
 142     s = 0;
 143     for (i = 0; i < 16; i++) {
 144         for (j = 0; j < 16; j += 8) {
 145             s += pix[0];
 146             s += pix[1];
 147             s += pix[2];
 148             s += pix[3];
 149             s += pix[4];
 150             s += pix[5];
 151             s += pix[6];
 152             s += pix[7];
 153             pix += 8;
 154         }
 155         pix += line_size - 16;
 156     }
 157     return s;
 158 }
 159
 160 static int pix_norm1_c(uint8_t * pix, int line_size)
 161 {
 162     int s, i, j;
 163     uint32_t *sq = squareTbl + 256;
 164
 165     s = 0;
 166     for (i = 0; i < 16; i++) {
 167         for (j = 0; j < 16; j += 8) {
 168 #if 0
 169             s += sq[pix[0]];
 170             s += sq[pix[1]];
 171             s += sq[pix[2]];
 172             s += sq[pix[3]];
 173             s += sq[pix[4]];
 174             s += sq[pix[5]];
 175             s += sq[pix[6]];
 176             s += sq[pix[7]];
 177 #else
 178 #if LONG_MAX > 2147483647
 179             register uint64_t x=*(uint64_t*)pix;
 180             s += sq[x&0xff];
 181             s += sq[(x>>8)&0xff];
 182             s += sq[(x>>16)&0xff];
 183             s += sq[(x>>24)&0xff];
 184             s += sq[(x>>32)&0xff];
 185             s += sq[(x>>40)&0xff];
 186             s += sq[(x>>48)&0xff];
 187             s += sq[(x>>56)&0xff];
 188 #else
 189             register uint32_t x=*(uint32_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             x=*(uint32_t*)(pix+4);
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199 #endif
 200 #endif
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 209     int i;
 210
 211     for(i=0; i+8<=w; i+=8){
 212         dst[i+0]= bswap_32(src[i+0]);
 213         dst[i+1]= bswap_32(src[i+1]);
 214         dst[i+2]= bswap_32(src[i+2]);
 215         dst[i+3]= bswap_32(src[i+3]);
 216         dst[i+4]= bswap_32(src[i+4]);
 217         dst[i+5]= bswap_32(src[i+5]);
 218         dst[i+6]= bswap_32(src[i+6]);
 219         dst[i+7]= bswap_32(src[i+7]);
 220     }
 221     for(;i<w; i++){
 222         dst[i+0]= bswap_32(src[i+0]);
 223     }
 224 }
 225
 226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 227 {
 228     int s, i;
 229     uint32_t *sq = squareTbl + 256;
 230
 231     s = 0;
 232     for (i = 0; i < h; i++) {
 233         s += sq[pix1[0] - pix2[0]];
 234         s += sq[pix1[1] - pix2[1]];
 235         s += sq[pix1[2] - pix2[2]];
 236         s += sq[pix1[3] - pix2[3]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[0] - pix2[0]];
 251         s += sq[pix1[1] - pix2[1]];
 252         s += sq[pix1[2] - pix2[2]];
 253         s += sq[pix1[3] - pix2[3]];
 254         s += sq[pix1[4] - pix2[4]];
 255         s += sq[pix1[5] - pix2[5]];
 256         s += sq[pix1[6] - pix2[6]];
 257         s += sq[pix1[7] - pix2[7]];
 258         pix1 += line_size;
 259         pix2 += line_size;
 260     }
 261     return s;
 262 }
 263
 264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 265 {
 266     int s, i;
 267     uint32_t *sq = squareTbl + 256;
 268
 269     s = 0;
 270     for (i = 0; i < h; i++) {
 271         s += sq[pix1[ 0] - pix2[ 0]];
 272         s += sq[pix1[ 1] - pix2[ 1]];
 273         s += sq[pix1[ 2] - pix2[ 2]];
 274         s += sq[pix1[ 3] - pix2[ 3]];
 275         s += sq[pix1[ 4] - pix2[ 4]];
 276         s += sq[pix1[ 5] - pix2[ 5]];
 277         s += sq[pix1[ 6] - pix2[ 6]];
 278         s += sq[pix1[ 7] - pix2[ 7]];
 279         s += sq[pix1[ 8] - pix2[ 8]];
 280         s += sq[pix1[ 9] - pix2[ 9]];
 281         s += sq[pix1[10] - pix2[10]];
 282         s += sq[pix1[11] - pix2[11]];
 283         s += sq[pix1[12] - pix2[12]];
 284         s += sq[pix1[13] - pix2[13]];
 285         s += sq[pix1[14] - pix2[14]];
 286         s += sq[pix1[15] - pix2[15]];
 287
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294
 295 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 296 #ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
 297     int s, i, j;
 298     const int dec_count= w==8 ? 3 : 4;
 299     int tmp[16*16];
 300 #if 0
 301     int level, ori;
 302     static const int scale[2][2][4][4]={
 303       {
 304         {
 305             //8x8 dec=3
 306             {268, 239, 239, 213},
 307             {  0, 224, 224, 152},
 308             {  0, 135, 135, 110},
 309         },{
 310             //16x16 dec=4
 311             {344, 310, 310, 280},
 312             {  0, 320, 320, 228},
 313             {  0, 175, 175, 136},
 314             {  0, 129, 129, 102},
 315         }
 316       },{
 317         {//FIXME 5/3
 318             //8x8 dec=3
 319             {275, 245, 245, 218},
 320             {  0, 230, 230, 156},
 321             {  0, 138, 138, 113},
 322         },{
 323             //16x16 dec=4
 324             {352, 317, 317, 286},
 325             {  0, 328, 328, 233},
 326             {  0, 180, 180, 140},
 327             {  0, 132, 132, 105},
 328         }
 329       }
 330     };
 331 #endif
 332
 333     for (i = 0; i < h; i++) {
 334         for (j = 0; j < w; j+=4) {
 335             tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 336             tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 337             tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 338             tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 339         }
 340         pix1 += line_size;
 341         pix2 += line_size;
 342     }
 343
 344     ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
 345
 346     s=0;
 347 #if 0
 348     for(level=0; level<dec_count; level++){
 349         for(ori= level ? 1 : 0; ori<4; ori++){
 350             int sx= (ori&1) ? 1<<level: 0;
 351             int stride= 16<<(dec_count-level);
 352             int sy= (ori&2) ? stride>>1 : 0;
 353             int size= 1<<level;
 354
 355             for(i=0; i<size; i++){
 356                 for(j=0; j<size; j++){
 357                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 358                     s += ABS(v);
 359                 }
 360             }
 361         }
 362     }
 363 #endif
 364     for (i = 0; i < h; i++) {
 365         for (j = 0; j < w; j+=4) {
 366             s+= ABS(tmp[16*i+j+0]);
 367             s+= ABS(tmp[16*i+j+1]);
 368             s+= ABS(tmp[16*i+j+2]);
 369             s+= ABS(tmp[16*i+j+3]);
 370         }
 371     }
 372     assert(s>=0);
 373
 374     return s>>2;
 375 #endif
 376 }
 377
 378 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 379     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 380 }
 381
 382 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 384 }
 385
 386 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 388 }
 389
 390 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 391     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 392 }
 393
 394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 395 {
 396     int i;
 397
 398     /* read the pixels */
 399     for(i=0;i<8;i++) {
 400         block[0] = pixels[0];
 401         block[1] = pixels[1];
 402         block[2] = pixels[2];
 403         block[3] = pixels[3];
 404         block[4] = pixels[4];
 405         block[5] = pixels[5];
 406         block[6] = pixels[6];
 407         block[7] = pixels[7];
 408         pixels += line_size;
 409         block += 8;
 410     }
 411 }
 412
 413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 414                           const uint8_t *s2, int stride){
 415     int i;
 416
 417     /* read the pixels */
 418     for(i=0;i<8;i++) {
 419         block[0] = s1[0] - s2[0];
 420         block[1] = s1[1] - s2[1];
 421         block[2] = s1[2] - s2[2];
 422         block[3] = s1[3] - s2[3];
 423         block[4] = s1[4] - s2[4];
 424         block[5] = s1[5] - s2[5];
 425         block[6] = s1[6] - s2[6];
 426         block[7] = s1[7] - s2[7];
 427         s1 += stride;
 428         s2 += stride;
 429         block += 8;
 430     }
 431 }
 432
 433
 434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 435                                  int line_size)
 436 {
 437     int i;
 438     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 439
 440     /* read the pixels */
 441     for(i=0;i<8;i++) {
 442         pixels[0] = cm[block[0]];
 443         pixels[1] = cm[block[1]];
 444         pixels[2] = cm[block[2]];
 445         pixels[3] = cm[block[3]];
 446         pixels[4] = cm[block[4]];
 447         pixels[5] = cm[block[5]];
 448         pixels[6] = cm[block[6]];
 449         pixels[7] = cm[block[7]];
 450
 451         pixels += line_size;
 452         block += 8;
 453     }
 454 }
 455
 456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 457                                  int line_size)
 458 {
 459     int i;
 460     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 461
 462     /* read the pixels */
 463     for(i=0;i<4;i++) {
 464         pixels[0] = cm[block[0]];
 465         pixels[1] = cm[block[1]];
 466         pixels[2] = cm[block[2]];
 467         pixels[3] = cm[block[3]];
 468
 469         pixels += line_size;
 470         block += 8;
 471     }
 472 }
 473
 474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 475                                  int line_size)
 476 {
 477     int i;
 478     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 479
 480     /* read the pixels */
 481     for(i=0;i<2;i++) {
 482         pixels[0] = cm[block[0]];
 483         pixels[1] = cm[block[1]];
 484
 485         pixels += line_size;
 486         block += 8;
 487     }
 488 }
 489
 490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 491                                         uint8_t *restrict pixels,
 492                                         int line_size)
 493 {
 494     int i, j;
 495
 496     for (i = 0; i < 8; i++) {
 497         for (j = 0; j < 8; j++) {
 498             if (*block < -128)
 499                 *pixels = 0;
 500             else if (*block > 127)
 501                 *pixels = 255;
 502             else
 503                 *pixels = (uint8_t)(*block + 128);
 504             block++;
 505             pixels++;
 506         }
 507         pixels += (line_size - 8);
 508     }
 509 }
 510
 511 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 512                           int line_size)
 513 {
 514     int i;
 515     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 516
 517     /* read the pixels */
 518     for(i=0;i<8;i++) {
 519         pixels[0] = cm[pixels[0] + block[0]];
 520         pixels[1] = cm[pixels[1] + block[1]];
 521         pixels[2] = cm[pixels[2] + block[2]];
 522         pixels[3] = cm[pixels[3] + block[3]];
 523         pixels[4] = cm[pixels[4] + block[4]];
 524         pixels[5] = cm[pixels[5] + block[5]];
 525         pixels[6] = cm[pixels[6] + block[6]];
 526         pixels[7] = cm[pixels[7] + block[7]];
 527         pixels += line_size;
 528         block += 8;
 529     }
 530 }
 531
 532 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 533                           int line_size)
 534 {
 535     int i;
 536     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 537
 538     /* read the pixels */
 539     for(i=0;i<4;i++) {
 540         pixels[0] = cm[pixels[0] + block[0]];
 541         pixels[1] = cm[pixels[1] + block[1]];
 542         pixels[2] = cm[pixels[2] + block[2]];
 543         pixels[3] = cm[pixels[3] + block[3]];
 544         pixels += line_size;
 545         block += 8;
 546     }
 547 }
 548
 549 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 550                           int line_size)
 551 {
 552     int i;
 553     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 554
 555     /* read the pixels */
 556     for(i=0;i<2;i++) {
 557         pixels[0] = cm[pixels[0] + block[0]];
 558         pixels[1] = cm[pixels[1] + block[1]];
 559         pixels += line_size;
 560         block += 8;
 561     }
 562 }
 563
 564 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 565 {
 566     int i;
 567     for(i=0;i<8;i++) {
 568         pixels[0] += block[0];
 569         pixels[1] += block[1];
 570         pixels[2] += block[2];
 571         pixels[3] += block[3];
 572         pixels[4] += block[4];
 573         pixels[5] += block[5];
 574         pixels[6] += block[6];
 575         pixels[7] += block[7];
 576         pixels += line_size;
 577         block += 8;
 578     }
 579 }
 580
 581 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 582 {
 583     int i;
 584     for(i=0;i<4;i++) {
 585         pixels[0] += block[0];
 586         pixels[1] += block[1];
 587         pixels[2] += block[2];
 588         pixels[3] += block[3];
 589         pixels += line_size;
 590         block += 4;
 591     }
 592 }
 593
 594 #if 0
 595
 596 #define PIXOP2(OPNAME, OP) \
 597 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 598 {\
 599     int i;\
 600     for(i=0; i<h; i++){\
 601         OP(*((uint64_t*)block), LD64(pixels));\
 602         pixels+=line_size;\
 603         block +=line_size;\
 604     }\
 605 }\
 606 \
 607 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 608 {\
 609     int i;\
 610     for(i=0; i<h; i++){\
 611         const uint64_t a= LD64(pixels  );\
 612         const uint64_t b= LD64(pixels+1);\
 613         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 614         pixels+=line_size;\
 615         block +=line_size;\
 616     }\
 617 }\
 618 \
 619 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 620 {\
 621     int i;\
 622     for(i=0; i<h; i++){\
 623         const uint64_t a= LD64(pixels  );\
 624         const uint64_t b= LD64(pixels+1);\
 625         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 626         pixels+=line_size;\
 627         block +=line_size;\
 628     }\
 629 }\
 630 \
 631 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 632 {\
 633     int i;\
 634     for(i=0; i<h; i++){\
 635         const uint64_t a= LD64(pixels          );\
 636         const uint64_t b= LD64(pixels+line_size);\
 637         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 638         pixels+=line_size;\
 639         block +=line_size;\
 640     }\
 641 }\
 642 \
 643 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 644 {\
 645     int i;\
 646     for(i=0; i<h; i++){\
 647         const uint64_t a= LD64(pixels          );\
 648         const uint64_t b= LD64(pixels+line_size);\
 649         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 650         pixels+=line_size;\
 651         block +=line_size;\
 652     }\
 653 }\
 654 \
 655 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 656 {\
 657         int i;\
 658         const uint64_t a= LD64(pixels  );\
 659         const uint64_t b= LD64(pixels+1);\
 660         uint64_t l0=  (a&0x0303030303030303ULL)\
 661                     + (b&0x0303030303030303ULL)\
 662                     + 0x0202020202020202ULL;\
 663         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 664                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 665         uint64_t l1,h1;\
 666 \
 667         pixels+=line_size;\
 668         for(i=0; i<h; i+=2){\
 669             uint64_t a= LD64(pixels  );\
 670             uint64_t b= LD64(pixels+1);\
 671             l1=  (a&0x0303030303030303ULL)\
 672                + (b&0x0303030303030303ULL);\
 673             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 674               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 675             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 676             pixels+=line_size;\
 677             block +=line_size;\
 678             a= LD64(pixels  );\
 679             b= LD64(pixels+1);\
 680             l0=  (a&0x0303030303030303ULL)\
 681                + (b&0x0303030303030303ULL)\
 682                + 0x0202020202020202ULL;\
 683             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 684               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 685             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 686             pixels+=line_size;\
 687             block +=line_size;\
 688         }\
 689 }\
 690 \
 691 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 692 {\
 693         int i;\
 694         const uint64_t a= LD64(pixels  );\
 695         const uint64_t b= LD64(pixels+1);\
 696         uint64_t l0=  (a&0x0303030303030303ULL)\
 697                     + (b&0x0303030303030303ULL)\
 698                     + 0x0101010101010101ULL;\
 699         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 700                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 701         uint64_t l1,h1;\
 702 \
 703         pixels+=line_size;\
 704         for(i=0; i<h; i+=2){\
 705             uint64_t a= LD64(pixels  );\
 706             uint64_t b= LD64(pixels+1);\
 707             l1=  (a&0x0303030303030303ULL)\
 708                + (b&0x0303030303030303ULL);\
 709             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 710               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 711             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 712             pixels+=line_size;\
 713             block +=line_size;\
 714             a= LD64(pixels  );\
 715             b= LD64(pixels+1);\
 716             l0=  (a&0x0303030303030303ULL)\
 717                + (b&0x0303030303030303ULL)\
 718                + 0x0101010101010101ULL;\
 719             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 720               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 721             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 722             pixels+=line_size;\
 723             block +=line_size;\
 724         }\
 725 }\
 726 \
 727 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 728 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 729 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 730 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 733 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 734
 735 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 736 #else // 64 bit variant
 737
 738 #define PIXOP2(OPNAME, OP) \
 739 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 740     int i;\
 741     for(i=0; i<h; i++){\
 742         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 743         pixels+=line_size;\
 744         block +=line_size;\
 745     }\
 746 }\
 747 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 748     int i;\
 749     for(i=0; i<h; i++){\
 750         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 751         pixels+=line_size;\
 752         block +=line_size;\
 753     }\
 754 }\
 755 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 756     int i;\
 757     for(i=0; i<h; i++){\
 758         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 759         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 760         pixels+=line_size;\
 761         block +=line_size;\
 762     }\
 763 }\
 764 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 765     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 766 }\
 767 \
 768 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 769                                                 int src_stride1, int src_stride2, int h){\
 770     int i;\
 771     for(i=0; i<h; i++){\
 772         uint32_t a,b;\
 773         a= LD32(&src1[i*src_stride1  ]);\
 774         b= LD32(&src2[i*src_stride2  ]);\
 775         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 776         a= LD32(&src1[i*src_stride1+4]);\
 777         b= LD32(&src2[i*src_stride2+4]);\
 778         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 779     }\
 780 }\
 781 \
 782 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 783                                                 int src_stride1, int src_stride2, int h){\
 784     int i;\
 785     for(i=0; i<h; i++){\
 786         uint32_t a,b;\
 787         a= LD32(&src1[i*src_stride1  ]);\
 788         b= LD32(&src2[i*src_stride2  ]);\
 789         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 790         a= LD32(&src1[i*src_stride1+4]);\
 791         b= LD32(&src2[i*src_stride2+4]);\
 792         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 793     }\
 794 }\
 795 \
 796 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 797                                                 int src_stride1, int src_stride2, int h){\
 798     int i;\
 799     for(i=0; i<h; i++){\
 800         uint32_t a,b;\
 801         a= LD32(&src1[i*src_stride1  ]);\
 802         b= LD32(&src2[i*src_stride2  ]);\
 803         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 804     }\
 805 }\
 806 \
 807 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 808                                                 int src_stride1, int src_stride2, int h){\
 809     int i;\
 810     for(i=0; i<h; i++){\
 811         uint32_t a,b;\
 812         a= LD16(&src1[i*src_stride1  ]);\
 813         b= LD16(&src2[i*src_stride2  ]);\
 814         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 815     }\
 816 }\
 817 \
 818 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 819                                                 int src_stride1, int src_stride2, int h){\
 820     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 821     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 822 }\
 823 \
 824 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 825                                                 int src_stride1, int src_stride2, int h){\
 826     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 827     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 828 }\
 829 \
 830 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 831     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 832 }\
 833 \
 834 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 835     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 836 }\
 837 \
 838 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 839     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 840 }\
 841 \
 842 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 843     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 844 }\
 845 \
 846 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 847                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 848     int i;\
 849     for(i=0; i<h; i++){\
 850         uint32_t a, b, c, d, l0, l1, h0, h1;\
 851         a= LD32(&src1[i*src_stride1]);\
 852         b= LD32(&src2[i*src_stride2]);\
 853         c= LD32(&src3[i*src_stride3]);\
 854         d= LD32(&src4[i*src_stride4]);\
 855         l0=  (a&0x03030303UL)\
 856            + (b&0x03030303UL)\
 857            + 0x02020202UL;\
 858         h0= ((a&0xFCFCFCFCUL)>>2)\
 859           + ((b&0xFCFCFCFCUL)>>2);\
 860         l1=  (c&0x03030303UL)\
 861            + (d&0x03030303UL);\
 862         h1= ((c&0xFCFCFCFCUL)>>2)\
 863           + ((d&0xFCFCFCFCUL)>>2);\
 864         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 865         a= LD32(&src1[i*src_stride1+4]);\
 866         b= LD32(&src2[i*src_stride2+4]);\
 867         c= LD32(&src3[i*src_stride3+4]);\
 868         d= LD32(&src4[i*src_stride4+4]);\
 869         l0=  (a&0x03030303UL)\
 870            + (b&0x03030303UL)\
 871            + 0x02020202UL;\
 872         h0= ((a&0xFCFCFCFCUL)>>2)\
 873           + ((b&0xFCFCFCFCUL)>>2);\
 874         l1=  (c&0x03030303UL)\
 875            + (d&0x03030303UL);\
 876         h1= ((c&0xFCFCFCFCUL)>>2)\
 877           + ((d&0xFCFCFCFCUL)>>2);\
 878         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 879     }\
 880 }\
 881 \
 882 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 883     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 884 }\
 885 \
 886 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 887     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 888 }\
 889 \
 890 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 891     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 892 }\
 893 \
 894 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 895     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 896 }\
 897 \
 898 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 899                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 900     int i;\
 901     for(i=0; i<h; i++){\
 902         uint32_t a, b, c, d, l0, l1, h0, h1;\
 903         a= LD32(&src1[i*src_stride1]);\
 904         b= LD32(&src2[i*src_stride2]);\
 905         c= LD32(&src3[i*src_stride3]);\
 906         d= LD32(&src4[i*src_stride4]);\
 907         l0=  (a&0x03030303UL)\
 908            + (b&0x03030303UL)\
 909            + 0x01010101UL;\
 910         h0= ((a&0xFCFCFCFCUL)>>2)\
 911           + ((b&0xFCFCFCFCUL)>>2);\
 912         l1=  (c&0x03030303UL)\
 913            + (d&0x03030303UL);\
 914         h1= ((c&0xFCFCFCFCUL)>>2)\
 915           + ((d&0xFCFCFCFCUL)>>2);\
 916         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 917         a= LD32(&src1[i*src_stride1+4]);\
 918         b= LD32(&src2[i*src_stride2+4]);\
 919         c= LD32(&src3[i*src_stride3+4]);\
 920         d= LD32(&src4[i*src_stride4+4]);\
 921         l0=  (a&0x03030303UL)\
 922            + (b&0x03030303UL)\
 923            + 0x01010101UL;\
 924         h0= ((a&0xFCFCFCFCUL)>>2)\
 925           + ((b&0xFCFCFCFCUL)>>2);\
 926         l1=  (c&0x03030303UL)\
 927            + (d&0x03030303UL);\
 928         h1= ((c&0xFCFCFCFCUL)>>2)\
 929           + ((d&0xFCFCFCFCUL)>>2);\
 930         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 931     }\
 932 }\
 933 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 934                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 935     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 936     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 937 }\
 938 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 939                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 940     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 941     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 942 }\
 943 \
 944 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 945 {\
 946         int i, a0, b0, a1, b1;\
 947         a0= pixels[0];\
 948         b0= pixels[1] + 2;\
 949         a0 += b0;\
 950         b0 += pixels[2];\
 951 \
 952         pixels+=line_size;\
 953         for(i=0; i<h; i+=2){\
 954             a1= pixels[0];\
 955             b1= pixels[1];\
 956             a1 += b1;\
 957             b1 += pixels[2];\
 958 \
 959             block[0]= (a1+a0)>>2; /* FIXME non put */\
 960             block[1]= (b1+b0)>>2;\
 961 \
 962             pixels+=line_size;\
 963             block +=line_size;\
 964 \
 965             a0= pixels[0];\
 966             b0= pixels[1] + 2;\
 967             a0 += b0;\
 968             b0 += pixels[2];\
 969 \
 970             block[0]= (a1+a0)>>2;\
 971             block[1]= (b1+b0)>>2;\
 972             pixels+=line_size;\
 973             block +=line_size;\
 974         }\
 975 }\
 976 \
 977 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 978 {\
 979         int i;\
 980         const uint32_t a= LD32(pixels  );\
 981         const uint32_t b= LD32(pixels+1);\
 982         uint32_t l0=  (a&0x03030303UL)\
 983                     + (b&0x03030303UL)\
 984                     + 0x02020202UL;\
 985         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 986                    + ((b&0xFCFCFCFCUL)>>2);\
 987         uint32_t l1,h1;\
 988 \
 989         pixels+=line_size;\
 990         for(i=0; i<h; i+=2){\
 991             uint32_t a= LD32(pixels  );\
 992             uint32_t b= LD32(pixels+1);\
 993             l1=  (a&0x03030303UL)\
 994                + (b&0x03030303UL);\
 995             h1= ((a&0xFCFCFCFCUL)>>2)\
 996               + ((b&0xFCFCFCFCUL)>>2);\
 997             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 998             pixels+=line_size;\
 999             block +=line_size;\
1000             a= LD32(pixels  );\
1001             b= LD32(pixels+1);\
1002             l0=  (a&0x03030303UL)\
1003                + (b&0x03030303UL)\
1004                + 0x02020202UL;\
1005             h0= ((a&0xFCFCFCFCUL)>>2)\
1006               + ((b&0xFCFCFCFCUL)>>2);\
1007             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008             pixels+=line_size;\
1009             block +=line_size;\
1010         }\
1011 }\
1012 \
1013 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1014 {\
1015     int j;\
1016     for(j=0; j<2; j++){\
1017         int i;\
1018         const uint32_t a= LD32(pixels  );\
1019         const uint32_t b= LD32(pixels+1);\
1020         uint32_t l0=  (a&0x03030303UL)\
1021                     + (b&0x03030303UL)\
1022                     + 0x02020202UL;\
1023         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1024                    + ((b&0xFCFCFCFCUL)>>2);\
1025         uint32_t l1,h1;\
1026 \
1027         pixels+=line_size;\
1028         for(i=0; i<h; i+=2){\
1029             uint32_t a= LD32(pixels  );\
1030             uint32_t b= LD32(pixels+1);\
1031             l1=  (a&0x03030303UL)\
1032                + (b&0x03030303UL);\
1033             h1= ((a&0xFCFCFCFCUL)>>2)\
1034               + ((b&0xFCFCFCFCUL)>>2);\
1035             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1036             pixels+=line_size;\
1037             block +=line_size;\
1038             a= LD32(pixels  );\
1039             b= LD32(pixels+1);\
1040             l0=  (a&0x03030303UL)\
1041                + (b&0x03030303UL)\
1042                + 0x02020202UL;\
1043             h0= ((a&0xFCFCFCFCUL)>>2)\
1044               + ((b&0xFCFCFCFCUL)>>2);\
1045             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1046             pixels+=line_size;\
1047             block +=line_size;\
1048         }\
1049         pixels+=4-line_size*(h+1);\
1050         block +=4-line_size*h;\
1051     }\
1052 }\
1053 \
1054 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1055 {\
1056     int j;\
1057     for(j=0; j<2; j++){\
1058         int i;\
1059         const uint32_t a= LD32(pixels  );\
1060         const uint32_t b= LD32(pixels+1);\
1061         uint32_t l0=  (a&0x03030303UL)\
1062                     + (b&0x03030303UL)\
1063                     + 0x01010101UL;\
1064         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1065                    + ((b&0xFCFCFCFCUL)>>2);\
1066         uint32_t l1,h1;\
1067 \
1068         pixels+=line_size;\
1069         for(i=0; i<h; i+=2){\
1070             uint32_t a= LD32(pixels  );\
1071             uint32_t b= LD32(pixels+1);\
1072             l1=  (a&0x03030303UL)\
1073                + (b&0x03030303UL);\
1074             h1= ((a&0xFCFCFCFCUL)>>2)\
1075               + ((b&0xFCFCFCFCUL)>>2);\
1076             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077             pixels+=line_size;\
1078             block +=line_size;\
1079             a= LD32(pixels  );\
1080             b= LD32(pixels+1);\
1081             l0=  (a&0x03030303UL)\
1082                + (b&0x03030303UL)\
1083                + 0x01010101UL;\
1084             h0= ((a&0xFCFCFCFCUL)>>2)\
1085               + ((b&0xFCFCFCFCUL)>>2);\
1086             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1087             pixels+=line_size;\
1088             block +=line_size;\
1089         }\
1090         pixels+=4-line_size*(h+1);\
1091         block +=4-line_size*h;\
1092     }\
1093 }\
1094 \
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1102 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1103
1104 #define op_avg(a, b) a = rnd_avg32(a, b)
1105 #endif
1106 #define op_put(a, b) a = b
1107
1108 PIXOP2(avg, op_avg)
1109 PIXOP2(put, op_put)
1110 #undef op_avg
1111 #undef op_put
1112
/* Rounded integer mean of two / four values (the sum is biased by half the
 * divisor before the shift). Every argument is parenthesized so operands
 * containing lower-precedence operators (&, |, ?:, ...) are averaged as a
 * whole. Arguments are evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1115
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both source blocks (a and b) are walked with the same line stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int src_stride = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride, src_stride, h);
}
1119
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both source blocks (a and b) are walked with the same line stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int src_stride = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride, src_stride, h);
}
1123
/**
 * Bilinear interpolation of an 8-pixel-wide block with 1/16 sample weights.
 *
 * Each output pixel blends the source pixel, its right neighbour and the two
 * pixels one line below them. The four weights always sum to 256, and the
 * weighted sum plus @p rounder is shifted right by 8.
 *
 * @param dst     destination, 8 bytes written per line
 * @param src     source, 9 bytes read on each of h+1 lines
 * @param stride  line stride shared by dst and src
 * @param h       number of lines to produce
 * @param x16     horizontal weight of the right-hand samples (in 1/16 units)
 * @param y16     vertical weight of the lower samples (in 1/16 units)
 * @param rounder value added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right = x16 * (16 - y16);
    const int w_bot_left  = (16 - x16) * y16;
    const int w_bot_right = x16 * y16;
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left  * src[col]          +
                        w_top_right * src[col + 1]      +
                        w_bot_left  * src[stride + col] +
                        w_bot_right * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1146
/**
 * Motion compensation of an 8-pixel-wide block using a per-pixel source
 * position that advances linearly across the block.
 *
 * For every output pixel the running position (vx, vy) is shifted right by
 * 16; the low @p shift bits of the result are the interpolation fraction and
 * the remaining bits are the integer source coordinate. Pixels whose
 * coordinate is in [0, width-1) x [0, height-1) are bilinearly interpolated.
 * Otherwise the out-of-range coordinate is passed through clip() with the
 * bounds 0 .. size-1 and interpolation is done only along the in-range axis
 * (or not at all when both are out of range).
 *
 * @param dst      destination block, h lines of 8 pixels
 * @param src      source picture
 * @param stride   line stride shared by dst and src
 * @param h        number of lines to produce
 * @param ox,oy    position for the first pixel of the first line
 * @param dxx,dyx  position increment per output column
 * @param dxy,dyy  position increment per output line
 * @param shift    number of fractional bits of the source coordinate
 * @param r        rounding value added before the final >>(2*shift)
 * @param width    source width in pixels
 * @param height   source height in pixels
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1<<shift;
    const int frac_mask = s - 1;
    int row;

    /* from here on width/height hold the last valid column/row index */
    width--;
    height--;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside = (unsigned)src_x < width;
            const int y_inside = (unsigned)src_y < height;
            uint8_t *out = &dst[row*stride + col];
            int index;

            if (x_inside && y_inside) {
                /* full bilinear blend of the 2x2 neighbourhood */
                index = src_x + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_x)
                           + src[index       +1]*   frac_x )*(s-frac_y)
                        + (  src[index+stride  ]*(s-frac_x)
                           + src[index+stride+1]*   frac_x )*   frac_y
                        + r)>>(shift*2);
            } else if (x_inside) {
                /* row out of range: horizontal blend only */
                index = src_x + clip(src_y, 0, height)*stride;
                *out = ( (  src[index         ]*(s-frac_x)
                          + src[index       +1]*   frac_x )*s
                        + r)>>(shift*2);
            } else if (y_inside) {
                /* column out of range: vertical blend only */
                index = clip(src_x, 0, width) + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_y)
                           + src[index+stride  ]*   frac_y )*s
                        + r)>>(shift*2);
            } else {
                /* both out of range: copy the clipped sample */
                index = clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                *out = src[index];
            }
        }
    }
}
1204
/**
 * Integer-position "tpel" copy: forwards to the put_pixelsN_c routine that
 * matches @p width (2, 4, 8 or 16). Any other width writes nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1213
/**
 * Writes (2*cur + right + 1)/3 for every pixel of a width x height block.
 * The division by 3 is done as a multiply by 683 followed by >>11.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1224
/**
 * Writes (cur + 2*right + 1)/3 for every pixel of a width x height block.
 * The division by 3 is done as a multiply by 683 followed by >>11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1235
/**
 * Writes (2*cur + below + 1)/3 for every pixel of a width x height block,
 * where "below" is the sample one stride further on.
 * The division by 3 is done as a multiply by 683 followed by >>11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1246
/**
 * Writes the 4:3:3:2 weighted mean of cur, right, below and below-right for
 * every pixel of a width x height block. The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 followed by >>15, with +6 for rounding.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1257
/**
 * Writes the 3:2:4:3 weighted mean of cur, right, below and below-right for
 * every pixel of a width x height block. The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 followed by >>15, with +6 for rounding.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 2*src[col+1]
                               + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1268
/**
 * Writes (cur + 2*below + 1)/3 for every pixel of a width x height block,
 * where "below" is the sample one stride further on.
 * The division by 3 is done as a multiply by 683 followed by >>11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1279
/**
 * Writes the 3:4:2:3 weighted mean of cur, right, below and below-right for
 * every pixel of a width x height block. The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 followed by >>15, with +6 for rounding.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 4*src[col+1]
                               + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1290
/**
 * Writes the 2:3:3:4 weighted mean of cur, right, below and below-right for
 * every pixel of a width x height block. The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 followed by >>15, with +6 for rounding.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1301
/**
 * Integer-position "tpel" average: forwards to the avg_pixelsN_c routine
 * that matches @p width (2, 4, 8 or 16). Any other width writes nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1310
/**
 * Computes (2*cur + right + 1)/3 (multiply by 683, >>11) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1321
/**
 * Computes (cur + 2*right + 1)/3 (multiply by 683, >>11) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1332
/**
 * Computes (2*cur + below + 1)/3 (multiply by 683, >>11) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst. "below" is the sample one stride further on.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1343
/**
 * Computes the 4:3:3:2 weighted mean of cur, right, below and below-right
 * (sum/12 via multiply by 2731 and >>15, +6 for rounding) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            const int pred = (2731*weighted) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1354
/**
 * Computes the 3:2:4:3 weighted mean of cur, right, below and below-right
 * (sum/12 via multiply by 2731 and >>15, +6 for rounding) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 2*src[col+1]
                               + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            const int pred = (2731*weighted) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1365
/**
 * Computes (cur + 2*below + 1)/3 (multiply by 683, >>11) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst. "below" is the sample one stride further on.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1376
/**
 * Computes the 3:4:2:3 weighted mean of cur, right, below and below-right
 * (sum/12 via multiply by 2731 and >>15, +6 for rounding) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 4*src[col+1]
                               + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            const int pred = (2731*weighted) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1387
/**
 * Computes the 2:3:3:4 weighted mean of cur, right, below and below-right
 * (sum/12 via multiply by 2731 and >>15, +6 for rounding) for every pixel of
 * a width x height block and stores its rounded average with the value
 * already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            const int pred = (2731*weighted) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width wrappers around the generic
 * put_tpel_pixels_mcXY_c() routines: TPEL_WIDTH(w) defines
 * put_tpel_pixels<w>_mcXY_c(dst, src, stride, height) for each of the nine
 * positions, forwarding with width = w.
 * The bodies are plain calls; a leading "void" would turn each of them into
 * an invalid declaration and keep the macro from compiling once enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1419
/* Generates one chroma motion-compensation kernel
 *     OPNAME h264_chroma_mc<W>_c(dst, src, stride, h, x, y)
 * producing h lines of W pixels. Each output is the bilinear blend of the
 * source pixel, its right neighbour and the two pixels one line below,
 * weighted by the 1/8-sample offsets x and y (both must be in 0..7; the four
 * weights sum to 64). The unscaled sum is handed to OP, which is responsible
 * for rounding/scaling and for storing into dst. W+1 source bytes are read
 * on each of h+1 lines. */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_top_left  = (8-x)*(8-y);\
    const int w_top_right = x*(8-y);\
    const int w_bot_left  = (8-x)*y;\
    const int w_bot_right = x*y;\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<W; col++){\
            OP(dst[col], (w_top_left*src[col] + w_top_right*src[col+1] + w_bot_left*src[stride+col] + w_bot_right*src[stride+col+1]));\
        }\
    }\
}

/* Emits the 2-, 4- and 8-pixel-wide kernels for one operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b is the weighted sum scaled by 64: round, scale down, then either blend
 * with the existing destination value (avg) or store it (put). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1490
/**
 * Copies h lines of 2 bytes from src to dst, one LD16/ST16 pair per line.
 * dst and src advance by their own strides after each line.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST16(dst, LD16(src));
    }
}
1501
/**
 * Copies h lines of 4 bytes from src to dst, one LD32/ST32 pair per line.
 * dst and src advance by their own strides after each line.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1512
/**
 * Copies h lines of 8 bytes from src to dst as two 32-bit words per line.
 * dst and src advance by their own strides after each line.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1524
/**
 * Copies h lines of 16 bytes from src to dst as four 32-bit words per line.
 * dst and src advance by their own strides after each line.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1538
/**
 * Copies h lines of 17 bytes from src to dst: four 32-bit words followed by
 * one trailing byte per line. dst and src advance by their own strides after
 * each line.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
    }
}
1553
/**
 * Copies h lines of 9 bytes from src to dst: two 32-bit words followed by
 * one trailing byte per line. dst and src advance by their own strides after
 * each line.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
    }
}
1566
1567
/**
 * QPEL_MC(r, OPNAME, RND, OP) expands to one complete family of quarter-pel
 * motion compensation functions for 8x8 and 16x16 blocks.
 *
 * Macro parameters:
 *  - r      : not referenced anywhere in the expansion.
 *  - OPNAME : prefix token-pasted onto every generated function name and onto
 *             the pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers (defined
 *             elsewhere in the file) that write the final result to dst.
 *  - RND    : token pasted between "put" and a helper name; it selects which
 *             put variant builds the intermediate planes (half, halfH, halfV,
 *             halfHV), independently of OPNAME.
 *  - OP     : two-argument macro, used as OP(destination lvalue, filter sum),
 *             that stores one filtered sample. Each lowpass function declares
 *             a local "cm" pointing into cropTbl solely for OP to use.
 *
 * Lowpass filters (N = 8 or 16):
 *  - OPNAME mpeg4_qpelN_h_lowpass(dst, src, dstStride, srcStride, h)
 *      processes h rows; each row reads src[0..N] (N+1 samples) and produces
 *      N outputs.
 *  - OPNAME mpeg4_qpelN_v_lowpass(dst, src, dstStride, srcStride)
 *      processes N columns; each column reads N+1 rows and produces N outputs.
 *  Each output is the 8-tap sum with weights (-1, 3, -6, 20, 20, -6, 3, -1)
 *  centred between two neighbouring samples. Taps that would fall outside
 *  the N+1 sample window are replaced by mirrored in-window samples (output 0
 *  uses src[0], src[1], src[2] where src[-1], src[-2], src[-3] would be), so
 *  nothing outside the window is read. The weights sum to 32; scaling the sum
 *  back down is left to OP.
 *
 * Entry points, OPNAME qpelN_mcXY_c(dst, src, stride), as implemented below:
 *  - X = 0 / Y = 0 : no filtering in that direction.
 *  - X = 2 / Y = 2 : lowpass in that direction.
 *  - X = 1 / Y = 1 : lowpass plane combined (pixelsN_l2) with the unshifted
 *                    source, respectively the unshifted intermediate plane.
 *  - X = 3 / Y = 3 : same, but combined with the plane shifted by one pixel
 *                    (src+1, full+1) or one row (full+16 / full+24, halfH+N).
 *  For mc11, mc31, mc13, mc33, mc12 and mc32 the horizontal plane halfH is
 *  first combined in place with the source copy and then filtered vertically.
 *  The non-static ff_ OPNAME qpelN_mcXY_old_c variants instead build separate
 *  halfH, halfV and halfHV planes and combine them with pixelsN_l4 (corner
 *  positions) or pixelsN_l2 (mc12, mc32).
 *  NOTE(review): pixelsN_l2 / pixelsN_l4 / copy_block9 / copy_block17 are
 *  defined outside this block; they are presumably 2-source and 4-source
 *  averages and a block copy - confirm against their definitions.
 *
 * Scratch buffers:
 *  - full[16*9]  / full[24*17] : 9 / 17 source rows, row stride 16 / 24.
 *  - halfH[72]   / halfH[272]  : 9 rows of 8 / 17 rows of 16 (one extra row so
 *                                the vertical filter has N+1 input rows).
 *  - half, halfV, halfHV       : N x N planes (64 / 256 bytes).
 */
1568 #define QPEL_MC(r, OPNAME, RND, OP) \
1569 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1571     int i;\
1572     for(i=0; i<h; i++)\
1573     {\
1574         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1575         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1576         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1577         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1578         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1579         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1580         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1581         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1582         dst+=dstStride;\
1583         src+=srcStride;\
1584     }\
1585 }\
1586 /* 8-wide vertical filter: 8 columns, each reading 9 rows and writing 8. */\
1587 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1588     const int w=8;\
1589     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1590     int i;\
1591     for(i=0; i<w; i++)\
1592     {\
1593         const int src0= src[0*srcStride];\
1594         const int src1= src[1*srcStride];\
1595         const int src2= src[2*srcStride];\
1596         const int src3= src[3*srcStride];\
1597         const int src4= src[4*srcStride];\
1598         const int src5= src[5*srcStride];\
1599         const int src6= src[6*srcStride];\
1600         const int src7= src[7*srcStride];\
1601         const int src8= src[8*srcStride];\
1602         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1603         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1604         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1605         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1606         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1607         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1608         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1609         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1610         dst++;\
1611         src++;\
1612     }\
1613 }\
1614 /* 16-wide horizontal filter: each row reads src[0..16] and writes 16 outputs. */\
1615 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1617     int i;\
1618     \
1619     for(i=0; i<h; i++)\
1620     {\
1621         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1622         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1623         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1624         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1625         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1626         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1627         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1628         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1629         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1630         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1631         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1632         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1633         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1634         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1635         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1636         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1637         dst+=dstStride;\
1638         src+=srcStride;\
1639     }\
1640 }\
1641 /* 16-wide vertical filter: 16 columns, each reading 17 rows and writing 16. */\
1642 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1643     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1644     int i;\
1645     const int w=16;\
1646     for(i=0; i<w; i++)\
1647     {\
1648         const int src0= src[0*srcStride];\
1649         const int src1= src[1*srcStride];\
1650         const int src2= src[2*srcStride];\
1651         const int src3= src[3*srcStride];\
1652         const int src4= src[4*srcStride];\
1653         const int src5= src[5*srcStride];\
1654         const int src6= src[6*srcStride];\
1655         const int src7= src[7*srcStride];\
1656         const int src8= src[8*srcStride];\
1657         const int src9= src[9*srcStride];\
1658         const int src10= src[10*srcStride];\
1659         const int src11= src[11*srcStride];\
1660         const int src12= src[12*srcStride];\
1661         const int src13= src[13*srcStride];\
1662         const int src14= src[14*srcStride];\
1663         const int src15= src[15*srcStride];\
1664         const int src16= src[16*srcStride];\
1665         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1666         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1667         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1668         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1669         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1670         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1671         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1672         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1673         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1674         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1675         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1676         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1677         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1678         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1679         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1680         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1681         dst++;\
1682         src++;\
1683     }\
1684 }\
1685 /* qpel8 mc00: no lowpass filtering, delegates to the OPNAME pixels8_c helper. */\
1686 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1687     OPNAME ## pixels8_c(dst, src, stride, 8);\
1688 }\
1689 /* qpel8 mc10: horizontal lowpass plane combined with the unshifted source. */\
1690 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1691     uint8_t half[64];\
1692     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1693     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1694 }\
1695 /* qpel8 mc20: horizontal lowpass written straight to dst through OP. */\
1696 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1697     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1698 }\
1699 /* qpel8 mc30: as mc10, but combined with the source shifted one pixel (src+1). */\
1700 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1701     uint8_t half[64];\
1702     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1703     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1704 }\
1705 /* qpel8 mc01: vertical lowpass plane combined with the copied source rows. */\
1706 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1707     uint8_t full[16*9];\
1708     uint8_t half[64];\
1709     copy_block9(full, src, 16, stride, 9);\
1710     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1711     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1712 }\
1713 /* qpel8 mc02: vertical lowpass of the copied rows written to dst through OP. */\
1714 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1715     uint8_t full[16*9];\
1716     copy_block9(full, src, 16, stride, 9);\
1717     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1718 }\
1719 /* qpel8 mc03: as mc01, but combined with the copy shifted down one row (full+16). */\
1720 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1721     uint8_t full[16*9];\
1722     uint8_t half[64];\
1723     copy_block9(full, src, 16, stride, 9);\
1724     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1725     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1726 }\
1727 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1728     uint8_t full[16*9];\
1729     uint8_t halfH[72];\
1730     uint8_t halfV[64];\
1731     uint8_t halfHV[64];\
1732     copy_block9(full, src, 16, stride, 9);\
1733     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1735     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 }\
1738 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1739     uint8_t full[16*9];\
1740     uint8_t halfH[72];\
1741     uint8_t halfHV[64];\
1742     copy_block9(full, src, 16, stride, 9);\
1743     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1744     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1745     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1746     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1747 }\
1748 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1749     uint8_t full[16*9];\
1750     uint8_t halfH[72];\
1751     uint8_t halfV[64];\
1752     uint8_t halfHV[64];\
1753     copy_block9(full, src, 16, stride, 9);\
1754     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1756     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 }\
1759 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1760     uint8_t full[16*9];\
1761     uint8_t halfH[72];\
1762     uint8_t halfHV[64];\
1763     copy_block9(full, src, 16, stride, 9);\
1764     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1766     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768 }\
1769 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770     uint8_t full[16*9];\
1771     uint8_t halfH[72];\
1772     uint8_t halfV[64];\
1773     uint8_t halfHV[64];\
1774     copy_block9(full, src, 16, stride, 9);\
1775     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1779 }\
1780 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1781     uint8_t full[16*9];\
1782     uint8_t halfH[72];\
1783     uint8_t halfHV[64];\
1784     copy_block9(full, src, 16, stride, 9);\
1785     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1789 }\
1790 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791     uint8_t full[16*9];\
1792     uint8_t halfH[72];\
1793     uint8_t halfV[64];\
1794     uint8_t halfHV[64];\
1795     copy_block9(full, src, 16, stride, 9);\
1796     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1797     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1800 }\
1801 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1802     uint8_t full[16*9];\
1803     uint8_t halfH[72];\
1804     uint8_t halfHV[64];\
1805     copy_block9(full, src, 16, stride, 9);\
1806     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1810 }\
1811 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1812     uint8_t halfH[72];\
1813     uint8_t halfHV[64];\
1814     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1815     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1816     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1817 }\
1818 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1819     uint8_t halfH[72];\
1820     uint8_t halfHV[64];\
1821     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1822     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1823     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1824 }\
1825 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826     uint8_t full[16*9];\
1827     uint8_t halfH[72];\
1828     uint8_t halfV[64];\
1829     uint8_t halfHV[64];\
1830     copy_block9(full, src, 16, stride, 9);\
1831     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1832     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1833     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1835 }\
1836 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1837     uint8_t full[16*9];\
1838     uint8_t halfH[72];\
1839     copy_block9(full, src, 16, stride, 9);\
1840     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1842     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1843 }\
1844 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1845     uint8_t full[16*9];\
1846     uint8_t halfH[72];\
1847     uint8_t halfV[64];\
1848     uint8_t halfHV[64];\
1849     copy_block9(full, src, 16, stride, 9);\
1850     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1851     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1852     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1853     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1854 }\
1855 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1856     uint8_t full[16*9];\
1857     uint8_t halfH[72];\
1858     copy_block9(full, src, 16, stride, 9);\
1859     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1861     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1862 }\
1863 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1864     uint8_t halfH[72];\
1865     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1866     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1867 }\
1868 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1869     OPNAME ## pixels16_c(dst, src, stride, 16);\
1870 }\
1871 /* qpel16 mc10: horizontal lowpass plane combined with the unshifted source. */\
1872 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1873     uint8_t half[256];\
1874     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1875     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1876 }\
1877 /* qpel16 mc20: horizontal lowpass written straight to dst through OP. */\
1878 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1879     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1880 }\
1881 /* qpel16 mc30: as mc10, but combined with the source shifted one pixel (src+1). */\
1882 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1883     uint8_t half[256];\
1884     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1885     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1886 }\
1887 /* qpel16 mc01: vertical lowpass plane combined with the copied source rows. */\
1888 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1889     uint8_t full[24*17];\
1890     uint8_t half[256];\
1891     copy_block17(full, src, 24, stride, 17);\
1892     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1893     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1894 }\
1895 /* qpel16 mc02: vertical lowpass of the copied rows written to dst through OP. */\
1896 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1897     uint8_t full[24*17];\
1898     copy_block17(full, src, 24, stride, 17);\
1899     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1900 }\
1901 /* qpel16 mc03: as mc01, but combined with the copy shifted down one row (full+24). */\
1902 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1903     uint8_t full[24*17];\
1904     uint8_t half[256];\
1905     copy_block17(full, src, 24, stride, 17);\
1906     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1907     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1908 }\
1909 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1910     uint8_t full[24*17];\
1911     uint8_t halfH[272];\
1912     uint8_t halfV[256];\
1913     uint8_t halfHV[256];\
1914     copy_block17(full, src, 24, stride, 17);\
1915     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1917     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 }\
1920 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1921     uint8_t full[24*17];\
1922     uint8_t halfH[272];\
1923     uint8_t halfHV[256];\
1924     copy_block17(full, src, 24, stride, 17);\
1925     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1926     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1927     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1928     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1929 }\
1930 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1931     uint8_t full[24*17];\
1932     uint8_t halfH[272];\
1933     uint8_t halfV[256];\
1934     uint8_t halfHV[256];\
1935     copy_block17(full, src, 24, stride, 17);\
1936     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1938     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 }\
1941 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1942     uint8_t full[24*17];\
1943     uint8_t halfH[272];\
1944     uint8_t halfHV[256];\
1945     copy_block17(full, src, 24, stride, 17);\
1946     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1947     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1948     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950 }\
1951 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1952     uint8_t full[24*17];\
1953     uint8_t halfH[272];\
1954     uint8_t halfV[256];\
1955     uint8_t halfHV[256];\
1956     copy_block17(full, src, 24, stride, 17);\
1957     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1958     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1959     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1960     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1961 }\
1962 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1963     uint8_t full[24*17];\
1964     uint8_t halfH[272];\
1965     uint8_t halfHV[256];\
1966     copy_block17(full, src, 24, stride, 17);\
1967     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1969     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1971 }\
1972 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1973     uint8_t full[24*17];\
1974     uint8_t halfH[272];\
1975     uint8_t halfV[256];\
1976     uint8_t halfHV[256];\
1977     copy_block17(full, src, 24, stride, 17);\
1978     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1979     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1980     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1981     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1982 }\
1983 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1984     uint8_t full[24*17];\
1985     uint8_t halfH[272];\
1986     uint8_t halfHV[256];\
1987     copy_block17(full, src, 24, stride, 17);\
1988     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1990     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1992 }\
1993 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1994     uint8_t halfH[272];\
1995     uint8_t halfHV[256];\
1996     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1997     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1999 }\
2000 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2001     uint8_t halfH[272];\
2002     uint8_t halfHV[256];\
2003     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2004     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2005     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006 }\
2007 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008     uint8_t full[24*17];\
2009     uint8_t halfH[272];\
2010     uint8_t halfV[256];\
2011     uint8_t halfHV[256];\
2012     copy_block17(full, src, 24, stride, 17);\
2013     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2014     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2015     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2017 }\
2018 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2019     uint8_t full[24*17];\
2020     uint8_t halfH[272];\
2021     copy_block17(full, src, 24, stride, 17);\
2022     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2024     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2025 }\
2026 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2027     uint8_t full[24*17];\
2028     uint8_t halfH[272];\
2029     uint8_t halfV[256];\
2030     uint8_t halfHV[256];\
2031     copy_block17(full, src, 24, stride, 17);\
2032     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2033     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2034     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2035     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2036 }\
2037 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2038     uint8_t full[24*17];\
2039     uint8_t halfH[272];\
2040     copy_block17(full, src, 24, stride, 17);\
2041     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2043     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2044 }\
2045 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2046     uint8_t halfH[272];\
2047     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2048     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2049 }
2050
/*
 * Store operations passed to QPEL_MC as its OP argument. In each of them
 * "a" is the destination lvalue and "b" the raw 8-tap filter sum; a pointer
 * named "cm" must be in scope where the macro is expanded. The sum is biased
 * (16 for the rounding variants, 15 for the no_rnd variants), shifted right
 * by 5 and then looked up through cm.
 */
/* Average the new sample with the value already in a, rounding up (+1). */
2051 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* As op_avg but with bias 15 and no +1; no QPEL_MC expansion below uses it. */
2052 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
/* Overwrite a with the new sample (bias 16). */
2053 #define op_put(a, b) a = cm[((b) + 16)>>5]
/* Overwrite a with the new sample (bias 15). */
2054 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2055
/*
 * Instantiate the function families. The second argument is the name prefix
 * of the generated functions; the third selects the put_ or put_no_rnd_
 * helpers used for intermediate planes (the avg_ family uses the put_ ones).
 */
2056 QPEL_MC(0, put_       , _       , op_put)
2057 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2058 QPEL_MC(0, avg_       , _       , op_avg)
/* Disabled: no avg_no_rnd family is generated. */
2059 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The op_* helpers are only meaningful for the expansions above. */
2060 #undef op_avg
2061 #undef op_avg_no_rnd
2062 #undef op_put
2063 #undef op_put_no_rnd
2064
2065 #if 1
2066 #define H264_LOWPASS(OPNAME, OP, OP2) \
2067 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2068     const int h=2;\
2069     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2070     int i;\
2071     for(i=0; i<h; i++)\
2072     {\
2073         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2074         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2075         dst+=dstStride;\
2076         src+=srcStride;\
2077     }\
2078 }\
2079 \
2080 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2081     const int w=2;\
2082     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2083     int i;\
2084     for(i=0; i<w; i++)\
2085     {\
2086         const int srcB= src[-2*srcStride];\
2087         const int srcA= src[-1*srcStride];\
2088         const int src0= src[0 *srcStride];\
2089         const int src1= src[1 *srcStride];\
2090         const int src2= src[2 *srcStride];\
2091         const int src3= src[3 *srcStride];\
2092         const int src4= src[4 *srcStride];\
2093         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2094         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2095         dst++;\
2096         src++;\
2097     }\
2098 }\
2099 \
2100 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2101     const int h=2;\
2102     const int w=2;\
2103     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2104     int i;\
2105     src -= 2*srcStride;\
2106     for(i=0; i<h+5; i++)\
2107     {\
2108         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2109         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2110         tmp+=tmpStride;\
2111         src+=srcStride;\
2112     }\
2113     tmp -= tmpStride*(h+5-2);\
2114     for(i=0; i<w; i++)\
2115     {\
2116         const int tmpB= tmp[-2*tmpStride];\
2117         const int tmpA= tmp[-1*tmpStride];\
2118         const int tmp0= tmp[0 *tmpStride];\
2119         const int tmp1= tmp[1 *tmpStride];\
2120         const int tmp2= tmp[2 *tmpStride];\
2121         const int tmp3= tmp[3 *tmpStride];\
2122         const int tmp4= tmp[4 *tmpStride];\
2123         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2124         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2125         dst++;\
2126         tmp++;\
2127     }\
2128 }\
2129 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2130     const int h=4;\
2131     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2132     int i;\
2133     for(i=0; i<h; i++)\
2134     {\
2135         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2136         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2137         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2138         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2139         dst+=dstStride;\
2140         src+=srcStride;\
2141     }\
2142 }\
2143 \
2144 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2145     const int w=4;\
2146     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2147     int i;\
2148     for(i=0; i<w; i++)\
2149     {\
2150         const int srcB= src[-2*srcStride];\
2151         const int srcA= src[-1*srcStride];\
2152         const int src0= src[0 *srcStride];\
2153         const int src1= src[1 *srcStride];\
2154         const int src2= src[2 *srcStride];\
2155         const int src3= src[3 *srcStride];\
2156         const int src4= src[4 *srcStride];\
2157         const int src5= src[5 *srcStride];\
2158         const int src6= src[6 *srcStride];\
2159         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2160         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2161         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2162         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2163         dst++;\
2164         src++;\
2165     }\
2166 }\
2167 \
2168 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2169     const int h=4;\
2170     const int w=4;\
2171     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2172     int i;\
2173     src -= 2*srcStride;\
2174     for(i=0; i<h+5; i++)\
2175     {\
2176         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2177         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2178         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2179         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2180         tmp+=tmpStride;\
2181         src+=srcStride;\
2182     }\
2183     tmp -= tmpStride*(h+5-2);\
2184     for(i=0; i<w; i++)\
2185     {\
2186         const int tmpB= tmp[-2*tmpStride];\
2187         const int tmpA= tmp[-1*tmpStride];\
2188         const int tmp0= tmp[0 *tmpStride];\
2189         const int tmp1= tmp[1 *tmpStride];\
2190         const int tmp2= tmp[2 *tmpStride];\
2191         const int tmp3= tmp[3 *tmpStride];\
2192         const int tmp4= tmp[4 *tmpStride];\
2193         const int tmp5= tmp[5 *tmpStride];\
2194         const int tmp6= tmp[6 *tmpStride];\
2195         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2196         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2197         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2198         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2199         dst++;\
2200         tmp++;\
2201     }\
2202 }\
2203 \
2204 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205     const int h=8;\
2206     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2207     int i;\
2208     for(i=0; i<h; i++)\
2209     {\
2210         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2211         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2212         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2213         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2214         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2215         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2216         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2217         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2218         dst+=dstStride;\
2219         src+=srcStride;\
2220     }\
2221 }\
2222 \
2223 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2224     const int w=8;\
2225     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2226     int i;\
2227     for(i=0; i<w; i++)\
2228     {\
2229         const int srcB= src[-2*srcStride];\
2230         const int srcA= src[-1*srcStride];\
2231         const int src0= src[0 *srcStride];\
2232         const int src1= src[1 *srcStride];\
2233         const int src2= src[2 *srcStride];\
2234         const int src3= src[3 *srcStride];\
2235         const int src4= src[4 *srcStride];\
2236         const int src5= src[5 *srcStride];\
2237         const int src6= src[6 *srcStride];\
2238         const int src7= src[7 *srcStride];\
2239         const int src8= src[8 *srcStride];\
2240         const int src9= src[9 *srcStride];\
2241         const int src10=src[10*srcStride];\
2242         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2243         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2244         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2245         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2246         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2247         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2248         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2249         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2250         dst++;\
2251         src++;\
2252     }\
2253 }\
2254 \
2255 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2256     const int h=8;\
2257     const int w=8;\
2258     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2259     int i;\
2260     src -= 2*srcStride;\
2261     for(i=0; i<h+5; i++)\
2262     {\
2263         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2264         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2265         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2266         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2267         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2268         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2269         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2270         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2271         tmp+=tmpStride;\
2272         src+=srcStride;\
2273     }\
2274     tmp -= tmpStride*(h+5-2);\
2275     for(i=0; i<w; i++)\
2276     {\
2277         const int tmpB= tmp[-2*tmpStride];\
2278         const int tmpA= tmp[-1*tmpStride];\
2279         const int tmp0= tmp[0 *tmpStride];\
2280         const int tmp1= tmp[1 *tmpStride];\
2281         const int tmp2= tmp[2 *tmpStride];\
2282         const int tmp3= tmp[3 *tmpStride];\
2283         const int tmp4= tmp[4 *tmpStride];\
2284         const int tmp5= tmp[5 *tmpStride];\
2285         const int tmp6= tmp[6 *tmpStride];\
2286         const int tmp7= tmp[7 *tmpStride];\
2287         const int tmp8= tmp[8 *tmpStride];\
2288         const int tmp9= tmp[9 *tmpStride];\
2289         const int tmp10=tmp[10*tmpStride];\
2290         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2291         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2292         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2293         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2294         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2295         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2296         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2297         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2298         dst++;\
2299         tmp++;\
2300     }\
2301 }\
2302 \
2303 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2304     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2305     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2306     src += 8*srcStride;\
2307     dst += 8*dstStride;\
2308     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2309     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2310 }\
2311 \
2312 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2313     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2314     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2315     src += 8*srcStride;\
2316     dst += 8*dstStride;\
2317     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2318     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2319 }\
2320 \
2321 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2322     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2323     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2324     src += 8*srcStride;\
2325     dst += 8*dstStride;\
2326     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2327     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2328 }\
2329
/*
 * H264_MC(OPNAME, SIZE) generates the 16 motion-compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c(dst, src, stride) for a SIZE x SIZE block.
 *
 * In the generated names X selects the horizontal and Y the vertical
 * sub-sample position (0..3).  Each function builds its result from the
 * h/v/hv lowpass helpers and, where two sources are needed, combines them
 * with OPNAME##pixels##SIZE##_l2.
 *
 * Intermediate buffers are always produced with the put_ helpers; only the
 * final store into dst goes through OPNAME.
 *
 * Functions that filter vertically first copy SIZE+5 rows, starting two rows
 * above src, into the compact buffer `full` (row stride SIZE); `full_mid`
 * points at the row that corresponds to src.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): plain block operation, no filtering */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): src combined with the horizontal lowpass result */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal lowpass written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): like mc10 but combined with the sample one column to the right (src+1) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): src row (full_mid) combined with the vertical lowpass result */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical lowpass written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
/* (0,3): like mc01 but combined with the row below (full_mid+SIZE) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,1): horizontal lowpass at src combined with vertical lowpass at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,1): horizontal lowpass at src combined with vertical lowpass one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,3): horizontal lowpass one row down combined with vertical lowpass at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,3): horizontal lowpass one row down combined with vertical lowpass one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): combined horizontal+vertical lowpass; tmp holds the 16-bit intermediate rows */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* (2,1): horizontal lowpass at src combined with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,3): horizontal lowpass one row down combined with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,2): vertical lowpass at src combined with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,2): vertical lowpass one column right combined with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2466
/*
 * Store operators handed to H264_LOWPASS as OP (single filter pass) and OP2
 * (horizontal followed by vertical pass).
 *
 * The lowpass taps used by the generated code are (1, -5, 20, 20, -5, 1),
 * which sum to 32, so a single pass is normalised with "+16 >> 5" and a
 * double pass (32*32 = 1024) with "+512 >> 10".  `cm` is the local
 * `cropTbl + MAX_NEG_CROP` pointer declared in every lowpass function;
 * presumably it clamps the index to the 0..255 pixel range (table contents
 * are initialised elsewhere).
 *
 * The _put forms overwrite the destination; the _avg forms replace it with
 * the rounded mean of the old destination value and the new sample.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass helpers, then the mcXY entry points per block size.
 * Note that no avg_ variant is instantiated for SIZE 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store operators are only meaningful inside the macros above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2487 #endif
2488
/*
 * Weighted-prediction kernels.
 *
 * H264_WEIGHT(W,H) generates two functions for a W x H block (W is one of
 * 2, 4, 8, 16; the row body is unrolled and cut short with `continue`
 * according to W):
 *
 *   weight_h264_pixelsWxH_c(block, stride, log2_denom, weight, offset)
 *       block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom)
 *       where offset' = offset * 2^log2_denom, plus the rounding term
 *       2^(log2_denom-1) when log2_denom is non-zero.
 *
 *   biweight_h264_pixelsWxH_c(dst, src, stride, log2_denom,
 *                             weightd, weights, offset)
 *       dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                           >> (log2_denom+1))
 *       where offset' = ((offset + 1) | 1) * 2^log2_denom.
 *
 * The offset is scaled with a multiplication rather than "<<": it may be
 * negative, and left-shifting a negative int is undefined behaviour in C.
 * The numeric result is unchanged.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* multiply instead of shift: offset may be negative */ \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* multiply instead of shift: (offset + 1) | 1 may be negative */ \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2558
2559 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2560     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2561     int i;
2562
2563     for(i=0; i<h; i++){
2564         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2565         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2566         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2567         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2568         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2569         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2570         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2571         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2572         dst+=dstStride;
2573         src+=srcStride;
2574     }
2575 }
2576
2577 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2578     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2579     int i;
2580
2581     for(i=0; i<w; i++){
2582         const int src_1= src[ -srcStride];
2583         const int src0 = src[0          ];
2584         const int src1 = src[  srcStride];
2585         const int src2 = src[2*srcStride];
2586         const int src3 = src[3*srcStride];
2587         const int src4 = src[4*srcStride];
2588         const int src5 = src[5*srcStride];
2589         const int src6 = src[6*srcStride];
2590         const int src7 = src[7*srcStride];
2591         const int src8 = src[8*srcStride];
2592         const int src9 = src[9*srcStride];
2593         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2594         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2595         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2596         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2597         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2598         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2599         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2600         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2601         src++;
2602         dst++;
2603     }
2604 }
2605
/* mspel (0,0): no filtering, delegates to put_pixels8_c for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2609
/* mspel (1,0): horizontal lowpass into a compact 8x8 buffer, then combined
 * with the unfiltered source through put_pixels8_l2 (defined elsewhere;
 * presumably a rounded average of its two inputs). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2615
/* mspel (2,0): horizontal lowpass written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2619
/* mspel (3,0): same as mc10 but the horizontal lowpass result is combined
 * with the source shifted one sample to the right (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2625
/* mspel (0,2): vertical lowpass over 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2629
/* mspel (1,2): combines the vertical lowpass of the source (halfV) with the
 * vertical lowpass of the horizontally filtered source (halfHV).
 *
 * halfH holds 11 rows of 8 samples starting one row above src, which is
 * exactly the -1 .. 9 row range the vertical filter reads; halfH+8 is the
 * row aligned with src. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12, but the plain vertical lowpass (halfV) is taken
 * one column to the right (src+1).  halfH again holds 11 horizontally
 * filtered rows starting one row above src. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): horizontal lowpass of 11 rows (starting one row above src)
 * into halfH, followed by a vertical lowpass of that buffer into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2653
2654 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655     int x;
2656     const int strength= ff_h263_loop_filter_strength[qscale];
2657
2658     for(x=0; x<8; x++){
2659         int d1, d2, ad1;
2660         int p0= src[x-2*stride];
2661         int p1= src[x-1*stride];
2662         int p2= src[x+0*stride];
2663         int p3= src[x+1*stride];
2664         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2665
2666         if     (d<-2*strength) d1= 0;
2667         else if(d<-  strength) d1=-2*strength - d;
2668         else if(d<   strength) d1= d;
2669         else if(d< 2*strength) d1= 2*strength - d;
2670         else                   d1= 0;
2671
2672         p1 += d1;
2673         p2 -= d1;
2674         if(p1&256) p1= ~(p1>>31);
2675         if(p2&256) p2= ~(p2>>31);
2676
2677         src[x-1*stride] = p1;
2678         src[x+0*stride] = p2;
2679
2680         ad1= ABS(d1)>>1;
2681
2682         d2= clip((p0-p3)/4, -ad1, ad1);
2683
2684         src[x-2*stride] = p0 - d2;
2685         src[x+  stride] = p3 + d2;
2686     }
2687 }
2688
2689 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2690     int y;
2691     const int strength= ff_h263_loop_filter_strength[qscale];
2692
2693     for(y=0; y<8; y++){
2694         int d1, d2, ad1;
2695         int p0= src[y*stride-2];
2696         int p1= src[y*stride-1];
2697         int p2= src[y*stride+0];
2698         int p3= src[y*stride+1];
2699         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2700
2701         if     (d<-2*strength) d1= 0;
2702         else if(d<-  strength) d1=-2*strength - d;
2703         else if(d<   strength) d1= d;
2704         else if(d< 2*strength) d1= 2*strength - d;
2705         else                   d1= 0;
2706
2707         p1 += d1;
2708         p2 -= d1;
2709         if(p1&256) p1= ~(p1>>31);
2710         if(p2&256) p2= ~(p2>>31);
2711
2712         src[y*stride-1] = p1;
2713         src[y*stride+0] = p2;
2714
2715         ad1= ABS(d1)>>1;
2716
2717         d2= clip((p0-p3)/4, -ad1, ad1);
2718
2719         src[y*stride-2] = p0 - d2;
2720         src[y*stride+1] = p3 + d2;
2721     }
2722 }
2723
/**
 * H.261 in-place loop filter for one 8x8 block.
 *
 * A separable (1, 2, 1) filter is applied vertically and then horizontally.
 * Samples on the block border are not filtered in the direction that would
 * leave the block: the first/last row skip the vertical pass and the
 * first/last column skip the horizontal pass.
 *
 * @param src    top-left sample of the 8x8 block, filtered in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];   /* result of the vertical pass, scaled by 4 */
    int x, y;

    /* vertical pass: read everything from src before anything is written */
    for(y=0; y<8; y++){
        const uint8_t *line= src + y*stride;
        int *out= vert + 8*y;

        for(x=0; x<8; x++){
            if(y==0 || y==7)
                out[x]= 4*line[x];
            else
                out[x]= line[x - stride] + 2*line[x] + line[x + stride];
        }
    }

    /* horizontal pass and normalisation back to 8 bits */
    for(y=0; y<8; y++){
        uint8_t *line= src + y*stride;
        const int *in= vert + 8*y;

        line[0]= (in[0] + 2)>>2;
        line[7]= (in[7] + 2)>>2;
        for(x=1; x<7; x++)
            line[x]= (in[x-1] + 2*in[x] + in[x+1] + 8)>>4;
    }
}
2750
/**
 * H.264 luma deblocking of one 16-sample edge (non-intra strength).
 *
 * The edge is processed as 4 segments of 4 lines; segment i uses tc0[i] as
 * its clipping bound and is skipped entirely when tc0[i] is negative.
 *
 * @param pix     first q0 sample of the edge
 * @param xstride step between samples across the edge (p2 p1 p0 | q0 q1 q2)
 * @param ystride step from one line of the edge to the next
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     4 clipping values, one per 4-line segment
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* segment disabled: step over its 4 lines */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* each side that is smooth enough also gets p1/q1 filtered
                   and widens the clipping range for p0/q0 by one */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* "* 4" rather than "<< 2": q0 - p0 may be negative and
                   left-shifting a negative int is undefined behaviour */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Luma deblocking with the p/q samples one row apart (xstride = stride) and
 * successive lines one sample apart (ystride = 1). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Luma deblocking with the p/q samples one sample apart (xstride = 1) and
 * successive lines one row apart (ystride = stride). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2799
/**
 * H.264 chroma deblocking of one 8-sample edge (non-intra strength).
 *
 * The edge is processed as 4 segments of 2 lines; segment i uses tc0[i] as
 * its clipping bound and is skipped when tc0[i] is zero or negative.
 * Only p0 and q0 are modified.
 *
 * @param pix     first q0 sample of the edge
 * @param xstride step between samples across the edge (p1 p0 | q0 q1)
 * @param ystride step from one line of the edge to the next
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping values, one per 2-line segment
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* segment disabled: step over its 2 lines */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* "* 4" rather than "<< 2": q0 - p0 may be negative and
                   left-shifting a negative int is undefined behaviour */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Chroma deblocking with the p/q samples one row apart (xstride = stride)
 * and successive lines one sample apart (ystride = 1). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma deblocking with the p/q samples one sample apart (xstride = 1) and
 * successive lines one row apart (ystride = stride). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2836
/**
 * H.264 chroma deblocking of one 8-sample edge, intra variant.
 *
 * No clipping table is used: on every line that passes the alpha/beta
 * tests, p0 and q0 are replaced by fixed weighted averages of their
 * neighbours.
 *
 * @param pix     first q0 sample of the edge
 * @param xstride step between samples across the edge (p1 p0 | q0 q1)
 * @param ystride step from one line of the edge to the next
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-xstride];
        const int q0 = pix[0];
        const int q1 = pix[xstride];

        /* leave the line untouched unless all three gradients are small */
        if( ABS( p0 - q0 ) >= alpha )
            continue;
        if( ABS( p1 - p0 ) >= beta )
            continue;
        if( ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/* Intra chroma deblocking with the p/q samples one row apart
 * (xstride = stride) and successive lines one sample apart (ystride = 1). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra chroma deblocking with the p/q samples one sample apart
 * (xstride = 1) and successive lines one row apart (ystride = stride). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2864
/**
 * Sum of absolute differences between two blocks 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return          sum over all rows and columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2892
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is interpolated horizontally: every reference value is avg2() of a sample
 * and its right-hand neighbour, so 17 reference samples are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2920
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is interpolated vertically: every reference value is avg2() of a sample
 * and the sample one row below it, so h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;   /* reference row under pix2 */
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2950
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is interpolated in both directions: every reference value is avg4() of a
 * 2x2 neighbourhood, so 17 samples from each of h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;   /* reference row under pix2 */
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2980
/**
 * Sum of absolute differences between two blocks 8 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return          sum over all rows and columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3000
/**
 * Sum of absolute differences for an 8-pixel-wide block where every pix1
 * sample is compared against avg2() of the pix2 sample at the same column
 * and its right-hand neighbour.
 *
 * Reads 9 columns per row starting at pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, horizontally averaged before comparison
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to process
 * @return          accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3020
/**
 * Sum of absolute differences for an 8-pixel-wide block where every pix1
 * sample is compared against avg2() of the pix2 sample at the same position
 * and the one directly below it.
 *
 * Reads h+1 rows starting at pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, vertically averaged before comparison
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to process
 * @return          accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
3042
/**
 * Sum of absolute differences for an 8-pixel-wide block where every pix1
 * sample is compared against avg4() of the 2x2 group of pix2 samples at
 * (x,y), (x+1,y), (x,y+1) and (x+1,y+1).
 *
 * Reads 9 columns and h+1 rows starting at pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, averaged before comparison
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to process
 * @return          accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg4(upper[col], upper[col + 1], lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
3064
3065 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3066     MpegEncContext *c = v;
3067     int score1=0;
3068     int score2=0;
3069     int x,y;
3070
3071     for(y=0; y<h; y++){
3072         for(x=0; x<16; x++){
3073             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3074         }
3075         if(y+1<h){
3076             for(x=0; x<15; x++){
3077                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3078                              - s1[x+1] + s1[x+1+stride])
3079                         -ABS(  s2[x  ] - s2[x  +stride]
3080                              - s2[x+1] + s2[x+1+stride]);
3081             }
3082         }
3083         s1+= stride;
3084         s2+= stride;
3085     }
3086
3087     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3088     else  return score1 + ABS(score2)*8;
3089 }
3090
3091 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092     MpegEncContext *c = v;
3093     int score1=0;
3094     int score2=0;
3095     int x,y;
3096
3097     for(y=0; y<h; y++){
3098         for(x=0; x<8; x++){
3099             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3100         }
3101         if(y+1<h){
3102             for(x=0; x<7; x++){
3103                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3104                              - s1[x+1] + s1[x+1+stride])
3105                         -ABS(  s2[x  ] - s2[x  +stride]
3106                              - s2[x+1] + s2[x+1+stride]);
3107             }
3108         }
3109         s1+= stride;
3110         s2+= stride;
3111     }
3112
3113     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114     else  return score1 + ABS(score2)*8;
3115 }
3116
3117 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3118     int i;
3119     unsigned int sum=0;
3120
3121     for(i=0; i<8*8; i++){
3122         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3123         int w= weight[i];
3124         b>>= RECON_SHIFT;
3125         assert(-512<b && b<512);
3126
3127         sum += (w*b)*(w*b)>>4;
3128     }
3129     return sum>>2;
3130 }
3131
3132 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3133     int i;
3134
3135     for(i=0; i<8*8; i++){
3136         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3137     }
3138 }
3139
3140 /**
3141  * permutes an 8x8 block.
3142  * @param block the block which will be permuted according to the given permutation vector
3143  * @param permutation the permutation vector
3144  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3145  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3146  *                  (inverse) permutated to scantable order!
3147  */
3148 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3149 {
3150     int i;
3151     DCTELEM temp[64];
3152
3153     if(last<=0) return;
3154     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3155
3156     for(i=0; i<=last; i++){
3157         const int j= scantable[i];
3158         temp[j]= block[j];
3159         block[j]=0;
3160     }
3161
3162     for(i=0; i<=last; i++){
3163         const int j= scantable[i];
3164         const int perm_j= permutation[j];
3165         block[perm_j]= temp[j];
3166     }
3167 }
3168
/**
 * Comparison function that ignores all of its arguments and always
 * returns 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3172
3173 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3174     int i;
3175
3176     memset(cmp, 0, sizeof(void*)*5);
3177
3178     for(i=0; i<5; i++){
3179         switch(type&0xFF){
3180         case FF_CMP_SAD:
3181             cmp[i]= c->sad[i];
3182             break;
3183         case FF_CMP_SATD:
3184             cmp[i]= c->hadamard8_diff[i];
3185             break;
3186         case FF_CMP_SSE:
3187             cmp[i]= c->sse[i];
3188             break;
3189         case FF_CMP_DCT:
3190             cmp[i]= c->dct_sad[i];
3191             break;
3192         case FF_CMP_DCT264:
3193             cmp[i]= c->dct264_sad[i];
3194             break;
3195         case FF_CMP_DCTMAX:
3196             cmp[i]= c->dct_max[i];
3197             break;
3198         case FF_CMP_PSNR:
3199             cmp[i]= c->quant_psnr[i];
3200             break;
3201         case FF_CMP_BIT:
3202             cmp[i]= c->bit[i];
3203             break;
3204         case FF_CMP_RD:
3205             cmp[i]= c->rd[i];
3206             break;
3207         case FF_CMP_VSAD:
3208             cmp[i]= c->vsad[i];
3209             break;
3210         case FF_CMP_VSSE:
3211             cmp[i]= c->vsse[i];
3212             break;
3213         case FF_CMP_ZERO:
3214             cmp[i]= zero_cmp;
3215             break;
3216         case FF_CMP_NSSE:
3217             cmp[i]= c->nsse[i];
3218             break;
3219         case FF_CMP_W53:
3220             cmp[i]= c->w53[i];
3221             break;
3222         case FF_CMP_W97:
3223             cmp[i]= c->w97[i];
3224             break;
3225         default:
3226             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3227         }
3228     }
3229 }
3230
3231 /**
3232  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3233  */
3234 static void clear_blocks_c(DCTELEM *blocks)
3235 {
3236     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3237 }
3238
/**
 * Adds src to dst byte by byte (modulo 256), in increasing index order.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] += src[n];
}
3254
/**
 * Stores src1 - src2 (modulo 256) into dst byte by byte, in increasing
 * index order.
 *
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = src1[n] - src2[n];
}
3270
/**
 * Writes to dst the difference between each src2 sample and a prediction
 * obtained with mid_pred() from three candidates: the previous src2 sample
 * ("left"), the src1 sample at the same index, and
 * (left + src1 sample - previous src1 sample) & 0xFF.
 *
 * @param dst      output, w bytes
 * @param src1     samples used as the second predictor candidate
 * @param src2     samples being predicted
 * @param w        number of samples
 * @param left     in: initial "left" value; out: last src2 sample processed
 * @param left_top in: initial "previous src1" value; out: last src1 sample
 *                 processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int above = src1[n];
        const int pred  = mid_pred(cur_left, above, (cur_left + above - cur_left_top)&0xFF);

        cur_left_top = above;
        cur_left     = src2[n];
        dst[n]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
3288
/* Butterfly with separate outputs: o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two bare statements (no enclosing braces), so it must not be
 * used as the body of an unbraced if/for/while. i1 and i2 are each
 * evaluated twice. */
3289 #define BUTTERFLY2(o1,o2,i1,i2) \
3290 o1= (i1)+(i2);\
3291 o2= (i1)-(i2);
3292
/* In-place butterfly: (x, y) becomes (x + y, x - y).
 * x and y must be lvalues. The expansion is a braced block that declares
 * locals named a and b, so the arguments must not refer to outer variables
 * with those names. */
3293 #define BUTTERFLY1(x,y) \
3294 {\
3295     int a,b;\
3296     a= x;\
3297     b= y;\
3298     x= a+b;\
3299     y= a-b;\
3300 }
3301
/* Final butterfly stage folded into the accumulation: yields
 * ABS(x + y) + ABS(x - y) without storing anything. x and y are written
 * twice in the expansion, so they must be free of side effects. */
3302 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3303
/**
 * Applies three butterfly stages along every row and then along every
 * column of the 8x8 difference src - dst, and returns the sum of the
 * absolute values of the resulting 64 coefficients (the last column stage
 * is folded into the summation by BUTTERFLYA).
 *
 * @param s      unused (MpegEncContext in the generic signature)
 * @param dst    block subtracted from src
 * @param src    block dst is subtracted from
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* Row pass: difference the inputs and run the three butterfly stages. */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Column pass: two stored stages, the third one is summed directly. */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
3355
/**
 * Applies three butterfly stages along every row and then along every
 * column of the 8x8 block src, sums the absolute values of the resulting
 * coefficients and finally subtracts ABS(temp[0] + temp[32]), the term the
 * original marks as "-mean".
 *
 * @param s      unused (MpegEncContext in the generic signature)
 * @param src    source block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* Row pass. */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Column pass: two stored stages, the third one is summed directly. */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(temp[8*0] + temp[8*4]); // -mean

    return total;
}
3403
3404 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3405     MpegEncContext * const s= (MpegEncContext *)c;
3406     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3407     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3408     int sum=0, i;
3409
3410     assert(h==8);
3411
3412     s->dsp.diff_pixels(temp, src1, src2, stride);
3413     s->dsp.fdct(temp);
3414
3415     for(i=0; i<64; i++)
3416         sum+= ABS(temp[i]);
3417
3418     return sum;
3419 }
3420
3421 #ifdef CONFIG_GPL
/* One-dimensional 8-point integer transform built from additions,
 * subtractions and right shifts only.
 * The user must define SRC(i) (value of input i, each input is read twice)
 * and DST(i, v) (consume output i with value v) before expanding it; the
 * expansion is a single braced block.
 * NOTE(review): presumably the H.264 8x8 forward transform, judging by the
 * dct264_* function that uses it - confirm against the specification. */
3422 #define DCT8_1D {\
3423     const int s07 = SRC(0) + SRC(7);\
3424     const int s16 = SRC(1) + SRC(6);\
3425     const int s25 = SRC(2) + SRC(5);\
3426     const int s34 = SRC(3) + SRC(4);\
3427     const int a0 = s07 + s34;\
3428     const int a1 = s16 + s25;\
3429     const int a2 = s07 - s34;\
3430     const int a3 = s16 - s25;\
3431     const int d07 = SRC(0) - SRC(7);\
3432     const int d16 = SRC(1) - SRC(6);\
3433     const int d25 = SRC(2) - SRC(5);\
3434     const int d34 = SRC(3) - SRC(4);\
3435     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3436     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3437     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3438     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3439     DST(0,  a0 + a1     ) ;\
3440     DST(1,  a4 + (a7>>2)) ;\
3441     DST(2,  a2 + (a3>>1)) ;\
3442     DST(3,  a5 + (a6>>2)) ;\
3443     DST(4,  a0 - a1     ) ;\
3444     DST(5,  a6 - (a5>>2)) ;\
3445     DST(6, (a2>>1) - a3 ) ;\
3446     DST(7, (a4>>2) - a7 ) ;\
3447 }
3448
3449 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3450     MpegEncContext * const s= (MpegEncContext *)c;
3451     int16_t dct[8][8];
3452     int i;
3453     int sum=0;
3454
3455     s->dsp.diff_pixels(dct, src1, src2, stride);
3456
3457 #define SRC(x) dct[i][x]
3458 #define DST(x,v) dct[i][x]= v
3459     for( i = 0; i < 8; i++ )
3460         DCT8_1D
3461 #undef SRC
3462 #undef DST
3463
3464 #define SRC(x) dct[x][i]
3465 #define DST(x,v) sum += ABS(v)
3466     for( i = 0; i < 8; i++ )
3467         DCT8_1D
3468 #undef SRC
3469 #undef DST
3470     return sum;
3471 }
3472 #endif
3473
3474 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3475     MpegEncContext * const s= (MpegEncContext *)c;
3476     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3477     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3478     int sum=0, i;
3479
3480     assert(h==8);
3481
3482     s->dsp.diff_pixels(temp, src1, src2, stride);
3483     s->dsp.fdct(temp);
3484
3485     for(i=0; i<64; i++)
3486         sum= FFMAX(sum, ABS(temp[i]));
3487
3488     return sum;
3489 }
3490
3491 void simple_idct(DCTELEM *block); //FIXME
3492
3493 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3494     MpegEncContext * const s= (MpegEncContext *)c;
3495     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3496     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3497     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3498     int sum=0, i;
3499
3500     assert(h==8);
3501     s->mb_intra=0;
3502
3503     s->dsp.diff_pixels(temp, src1, src2, stride);
3504
3505     memcpy(bak, temp, 64*sizeof(DCTELEM));
3506
3507     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3508     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3509     simple_idct(temp); //FIXME
3510
3511     for(i=0; i<64; i++)
3512         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3513
3514     return sum;
3515 }
3516
3517 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3518     MpegEncContext * const s= (MpegEncContext *)c;
3519     const uint8_t *scantable= s->intra_scantable.permutated;
3520     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3521     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3522     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3523     uint8_t * const bak= (uint8_t*)aligned_bak;
3524     int i, last, run, bits, level, distoration, start_i;
3525     const int esc_length= s->ac_esc_length;
3526     uint8_t * length;
3527     uint8_t * last_length;
3528
3529     assert(h==8);
3530
3531     for(i=0; i<8; i++){
3532         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3533         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3534     }
3535
3536     s->dsp.diff_pixels(temp, src1, src2, stride);
3537
3538     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3539
3540     bits=0;
3541
3542     if (s->mb_intra) {
3543         start_i = 1;
3544         length     = s->intra_ac_vlc_length;
3545         last_length= s->intra_ac_vlc_last_length;
3546         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3547     } else {
3548         start_i = 0;
3549         length     = s->inter_ac_vlc_length;
3550         last_length= s->inter_ac_vlc_last_length;
3551     }
3552
3553     if(last>=start_i){
3554         run=0;
3555         for(i=start_i; i<last; i++){
3556             int j= scantable[i];
3557             level= temp[j];
3558
3559             if(level){
3560                 level+=64;
3561                 if((level&(~127)) == 0){
3562                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3563                 }else
3564                     bits+= esc_length;
3565                 run=0;
3566             }else
3567                 run++;
3568         }
3569         i= scantable[last];
3570
3571         level= temp[i] + 64;
3572
3573         assert(level - 64);
3574
3575         if((level&(~127)) == 0){
3576             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3577         }else
3578             bits+= esc_length;
3579
3580     }
3581
3582     if(last>=0){
3583         if(s->mb_intra)
3584             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3585         else
3586             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3587     }
3588
3589     s->dsp.idct_add(bak, stride, temp);
3590
3591     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3592
3593     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3594 }
3595
3596 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3597     MpegEncContext * const s= (MpegEncContext *)c;
3598     const uint8_t *scantable= s->intra_scantable.permutated;
3599     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3600     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3601     int i, last, run, bits, level, start_i;
3602     const int esc_length= s->ac_esc_length;
3603     uint8_t * length;
3604     uint8_t * last_length;
3605
3606     assert(h==8);
3607
3608     s->dsp.diff_pixels(temp, src1, src2, stride);
3609
3610     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3611
3612     bits=0;
3613
3614     if (s->mb_intra) {
3615         start_i = 1;
3616         length     = s->intra_ac_vlc_length;
3617         last_length= s->intra_ac_vlc_last_length;
3618         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3619     } else {
3620         start_i = 0;
3621         length     = s->inter_ac_vlc_length;
3622         last_length= s->inter_ac_vlc_last_length;
3623     }
3624
3625     if(last>=start_i){
3626         run=0;
3627         for(i=start_i; i<last; i++){
3628             int j= scantable[i];
3629             level= temp[j];
3630
3631             if(level){
3632                 level+=64;
3633                 if((level&(~127)) == 0){
3634                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3635                 }else
3636                     bits+= esc_length;
3637                 run=0;
3638             }else
3639                 run++;
3640         }
3641         i= scantable[last];
3642
3643         level= temp[i] + 64;
3644
3645         assert(level - 64);
3646
3647         if((level&(~127)) == 0){
3648             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3649         }else
3650             bits+= esc_length;
3651     }
3652
3653     return bits;
3654 }
3655
/**
 * Sum of absolute differences between vertically adjacent samples of a
 * 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s[col] - s[col + stride];
            total += ABS(d);
        }
        s += stride;
    }

    return total;
}
3670
/**
 * Sum of absolute differences between vertically adjacent samples of the
 * difference s1 - s2, for a 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += ABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3685
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent samples of a
 * 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += SQ(s[col] - s[col + stride]);
        s += stride;
    }

    return total;
}
3701
/**
 * Sum of squared differences between vertically adjacent samples of the
 * difference s1 - s2, for a 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3716
/* Instantiate the *16_c comparison functions from their 8x8 counterparts.
 * WARPER8_16_SQ is defined outside this part of the file; each use takes
 * the name of an 8x8 function and the name of the function to generate.
 * NOTE(review): presumably the generated function applies the 8x8 one to
 * each 8x8 quadrant of a 16-wide block and sums the results - confirm
 * against the macro definition.
 * The dct264 variant exists only in CONFIG_GPL builds, like the function it
 * wraps. */
3717 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3718 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3719 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3720 #ifdef CONFIG_GPL
3721 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3722 #endif
3723 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3724 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3725 WARPER8_16_SQ(rd8x8_c, rd16_c)
3726 WARPER8_16_SQ(bit8x8_c, bit16_c)
3727
3728 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3729  converted */
3730 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3731 {
3732     j_rev_dct (block);
3733     put_pixels_clamped_c(block, dest, line_size);
3734 }
3735 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3736 {
3737     j_rev_dct (block);
3738     add_pixels_clamped_c(block, dest, line_size);
3739 }
3740
3741 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3742 {
3743     j_rev_dct4 (block);
3744     put_pixels_clamped4_c(block, dest, line_size);
3745 }
3746 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3747 {
3748     j_rev_dct4 (block);
3749     add_pixels_clamped4_c(block, dest, line_size);
3750 }
3751
3752 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3753 {
3754     j_rev_dct2 (block);
3755     put_pixels_clamped2_c(block, dest, line_size);
3756 }
3757 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3758 {
3759     j_rev_dct2 (block);
3760     add_pixels_clamped2_c(block, dest, line_size);
3761 }
3762
3763 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3764 {
3765     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3766
3767     dest[0] = cm[(block[0] + 4)>>3];
3768 }
3769 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3770 {
3771     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3772
3773     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3774 }
3775
/* Does nothing. The parameter list is intentionally left unprototyped
 * (presumably so the function can stand in for hooks with differing
 * signatures - confirm at the assignment sites). */
static void just_return() { }
3777
3778 /* init static data */
3779 void dsputil_static_init(void)
3780 {
3781     int i;
3782
3783     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3784     for(i=0;i<MAX_NEG_CROP;i++) {
3785         cropTbl[i] = 0;
3786         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3787     }
3788
3789     for(i=0;i<512;i++) {
3790         squareTbl[i] = (i - 256) * (i - 256);
3791     }
3792
3793     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3794 }
3795
3796
3797 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3798 {
3799     int i;
3800
3801 #ifdef CONFIG_ENCODERS
3802     if(avctx->dct_algo==FF_DCT_FASTINT) {
3803         c->fdct = fdct_ifast;
3804         c->fdct248 = fdct_ifast248;
3805     }
3806     else if(avctx->dct_algo==FF_DCT_FAAN) {
3807         c->fdct = ff_faandct;
3808         c->fdct248 = ff_faandct248;
3809     }
3810     else {
3811         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3812         c->fdct248 = ff_fdct248_islow;
3813     }
3814 #endif //CONFIG_ENCODERS
3815
3816     if(avctx->lowres==1){
3817         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3818             c->idct_put= ff_jref_idct4_put;
3819             c->idct_add= ff_jref_idct4_add;
3820         }else{
3821             c->idct_put= ff_h264_lowres_idct_put_c;
3822             c->idct_add= ff_h264_lowres_idct_add_c;
3823         }
3824         c->idct    = j_rev_dct4;
3825         c->idct_permutation_type= FF_NO_IDCT_PERM;
3826     }else if(avctx->lowres==2){
3827         c->idct_put= ff_jref_idct2_put;
3828         c->idct_add= ff_jref_idct2_add;
3829         c->idct    = j_rev_dct2;
3830         c->idct_permutation_type= FF_NO_IDCT_PERM;
3831     }else if(avctx->lowres==3){
3832         c->idct_put= ff_jref_idct1_put;
3833         c->idct_add= ff_jref_idct1_add;
3834         c->idct    = j_rev_dct1;
3835         c->idct_permutation_type= FF_NO_IDCT_PERM;
3836     }else{
3837         if(avctx->idct_algo==FF_IDCT_INT){
3838             c->idct_put= ff_jref_idct_put;
3839             c->idct_add= ff_jref_idct_add;
3840             c->idct    = j_rev_dct;
3841             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3842         }else if(avctx->idct_algo==FF_IDCT_VP3){
3843             c->idct_put= ff_vp3_idct_put_c;
3844             c->idct_add= ff_vp3_idct_add_c;
3845             c->idct    = ff_vp3_idct_c;
3846             c->idct_permutation_type= FF_NO_IDCT_PERM;
3847         }else{ //accurate/default
3848             c->idct_put= simple_idct_put;
3849             c->idct_add= simple_idct_add;
3850             c->idct    = simple_idct;
3851             c->idct_permutation_type= FF_NO_IDCT_PERM;
3852         }
3853     }
3854
3855     c->h264_idct_add= ff_h264_idct_add_c;
3856     c->h264_idct8_add= ff_h264_idct8_add_c;
3857     c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3858     c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3859
3860     c->get_pixels = get_pixels_c;
3861     c->diff_pixels = diff_pixels_c;
3862     c->put_pixels_clamped = put_pixels_clamped_c;
3863     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3864     c->add_pixels_clamped = add_pixels_clamped_c;
3865     c->add_pixels8 = add_pixels8_c;
3866     c->add_pixels4 = add_pixels4_c;
3867     c->gmc1 = gmc1_c;
3868     c->gmc = ff_gmc_c;
3869     c->clear_blocks = clear_blocks_c;
3870     c->pix_sum = pix_sum_c;
3871     c->pix_norm1 = pix_norm1_c;
3872
3873     /* TODO [0] 16  [1] 8 */
3874     c->pix_abs[0][0] = pix_abs16_c;
3875     c->pix_abs[0][1] = pix_abs16_x2_c;
3876     c->pix_abs[0][2] = pix_abs16_y2_c;
3877     c->pix_abs[0][3] = pix_abs16_xy2_c;
3878     c->pix_abs[1][0] = pix_abs8_c;
3879     c->pix_abs[1][1] = pix_abs8_x2_c;
3880     c->pix_abs[1][2] = pix_abs8_y2_c;
3881     c->pix_abs[1][3] = pix_abs8_xy2_c;
3882
3883 #define dspfunc(PFX, IDX, NUM) \
3884     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3885     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3886     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3887     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3888
3889     dspfunc(put, 0, 16);
3890     dspfunc(put_no_rnd, 0, 16);
3891     dspfunc(put, 1, 8);
3892     dspfunc(put_no_rnd, 1, 8);
3893     dspfunc(put, 2, 4);
3894     dspfunc(put, 3, 2);
3895
3896     dspfunc(avg, 0, 16);
3897     dspfunc(avg_no_rnd, 0, 16);
3898     dspfunc(avg, 1, 8);
3899     dspfunc(avg_no_rnd, 1, 8);
3900     dspfunc(avg, 2, 4);
3901     dspfunc(avg, 3, 2);
3902 #undef dspfunc
3903
3904     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3905     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3906
3907     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3908     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3909     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3910     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3911     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3912     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3913     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3914     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3915     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3916
3917     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3918     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3919     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3920     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3921     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3922     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3923     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3924     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3925     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3926
3927 #define dspfunc(PFX, IDX, NUM) \
3928     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3929     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3930     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3931     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3932     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3933     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3934     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3935     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3936     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3937     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3938     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3939     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3940     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3941     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3942     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3943     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3944
3945     dspfunc(put_qpel, 0, 16);
3946     dspfunc(put_no_rnd_qpel, 0, 16);
3947
3948     dspfunc(avg_qpel, 0, 16);
3949     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3950
3951     dspfunc(put_qpel, 1, 8);
3952     dspfunc(put_no_rnd_qpel, 1, 8);
3953
3954     dspfunc(avg_qpel, 1, 8);
3955     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3956
3957     dspfunc(put_h264_qpel, 0, 16);
3958     dspfunc(put_h264_qpel, 1, 8);
3959     dspfunc(put_h264_qpel, 2, 4);
3960     dspfunc(put_h264_qpel, 3, 2);
3961     dspfunc(avg_h264_qpel, 0, 16);
3962     dspfunc(avg_h264_qpel, 1, 8);
3963     dspfunc(avg_h264_qpel, 2, 4);
3964
3965 #undef dspfunc
3966     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3967     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3968     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3969     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3970     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3971     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3972
3973     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3974     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3975     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3976     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3977     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3978     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3979     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3980     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3981     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3982     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3983     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3984     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3985     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3986     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3987     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3988     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3989     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3990     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3991     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3992     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3993
3994     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3995     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3996     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3997     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3998     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3999     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4000     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4001     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4002
4003 #define SET_CMP_FUNC(name) \
4004     c->name[0]= name ## 16_c;\
4005     c->name[1]= name ## 8x8_c;
4006
4007     SET_CMP_FUNC(hadamard8_diff)
4008     c->hadamard8_diff[4]= hadamard8_intra16_c;
4009     SET_CMP_FUNC(dct_sad)
4010     SET_CMP_FUNC(dct_max)
4011 #ifdef CONFIG_GPL
4012     SET_CMP_FUNC(dct264_sad)
4013 #endif
4014     c->sad[0]= pix_abs16_c;
4015     c->sad[1]= pix_abs8_c;
4016     c->sse[0]= sse16_c;
4017     c->sse[1]= sse8_c;
4018     c->sse[2]= sse4_c;
4019     SET_CMP_FUNC(quant_psnr)
4020     SET_CMP_FUNC(rd)
4021     SET_CMP_FUNC(bit)
4022     c->vsad[0]= vsad16_c;
4023     c->vsad[4]= vsad_intra16_c;
4024     c->vsse[0]= vsse16_c;
4025     c->vsse[4]= vsse_intra16_c;
4026     c->nsse[0]= nsse16_c;
4027     c->nsse[1]= nsse8_c;
4028     c->w53[0]= w53_16_c;
4029     c->w53[1]= w53_8_c;
4030     c->w97[0]= w97_16_c;
4031     c->w97[1]= w97_8_c;
4032
4033     c->add_bytes= add_bytes_c;
4034     c->diff_bytes= diff_bytes_c;
4035     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4036     c->bswap_buf= bswap_buf;
4037
4038     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4039     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4040     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4041     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4042     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4043     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4044
4045     c->h263_h_loop_filter= h263_h_loop_filter_c;
4046     c->h263_v_loop_filter= h263_v_loop_filter_c;
4047
4048     c->h261_loop_filter= h261_loop_filter_c;
4049
4050     c->try_8x8basis= try_8x8basis_c;
4051     c->add_8x8basis= add_8x8basis_c;
4052
4053 #ifdef CONFIG_SNOW_ENCODER
4054     c->vertical_compose97i = ff_snow_vertical_compose97i;
4055     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4056     c->inner_add_yblock = ff_snow_inner_add_yblock;
4057 #endif
4058
4059     c->shrink[0]= ff_img_copy_plane;
4060     c->shrink[1]= ff_shrink22;
4061     c->shrink[2]= ff_shrink44;
4062     c->shrink[3]= ff_shrink88;
4063
4064     c->prefetch= just_return;
4065
4066 #ifdef HAVE_MMX
4067     dsputil_init_mmx(c, avctx);
4068 #endif
4069 #ifdef ARCH_ARMV4L
4070     dsputil_init_armv4l(c, avctx);
4071 #endif
4072 #ifdef HAVE_MLIB
4073     dsputil_init_mlib(c, avctx);
4074 #endif
4075 #ifdef ARCH_SPARC
4076    dsputil_init_vis(c,avctx);
4077 #endif
4078 #ifdef ARCH_ALPHA
4079     dsputil_init_alpha(c, avctx);
4080 #endif
4081 #ifdef ARCH_POWERPC
4082     dsputil_init_ppc(c, avctx);
4083 #endif
4084 #ifdef HAVE_MMI
4085     dsputil_init_mmi(c, avctx);
4086 #endif
4087 #ifdef ARCH_SH4
4088     dsputil_init_sh4(c,avctx);
4089 #endif
4090
4091     switch(c->idct_permutation_type){
4092     case FF_NO_IDCT_PERM:
4093         for(i=0; i<64; i++)
4094             c->idct_permutation[i]= i;
4095         break;
4096     case FF_LIBMPEG2_IDCT_PERM:
4097         for(i=0; i<64; i++)
4098             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4099         break;
4100     case FF_SIMPLE_IDCT_PERM:
4101         for(i=0; i<64; i++)
4102             c->idct_permutation[i]= simple_mmx_permutation[i];
4103         break;
4104     case FF_TRANSPOSE_IDCT_PERM:
4105         for(i=0; i<64; i++)
4106             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4107         break;
4108     case FF_PARTTRANS_IDCT_PERM:
4109         for(i=0; i<64; i++)
4110             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4111         break;
4112     default:
4113         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4114     }
4115 }
4116