git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
  34 /* snow.c */
  35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  36
  37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  38 uint32_t squareTbl[512] = {0, };
  39
  40 const uint8_t ff_zigzag_direct[64] = {
  41     0,   1,  8, 16,  9,  2,  3, 10,
  42     17, 24, 32, 25, 18, 11,  4,  5,
  43     12, 19, 26, 33, 40, 48, 41, 34,
  44     27, 20, 13,  6,  7, 14, 21, 28,
  45     35, 42, 49, 56, 57, 50, 43, 36,
  46     29, 22, 15, 23, 30, 37, 44, 51,
  47     58, 59, 52, 45, 38, 31, 39, 46,
  48     53, 60, 61, 54, 47, 55, 62, 63
  49 };
  50
  51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  52    specification, we interleave the fields */
  53 const uint8_t ff_zigzag248_direct[64] = {
  54      0,  8,  1,  9, 16, 24,  2, 10,
  55     17, 25, 32, 40, 48, 56, 33, 41,
  56     18, 26,  3, 11,  4, 12, 19, 27,
  57     34, 42, 49, 57, 50, 58, 35, 43,
  58     20, 28,  5, 13,  6, 14, 21, 29,
  59     36, 44, 51, 59, 52, 60, 37, 45,
  60     22, 30,  7, 15, 23, 31, 38, 46,
  61     53, 61, 54, 62, 39, 47, 55, 63,
  62 };
  63
  64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
  66
  67 const uint8_t ff_alternate_horizontal_scan[64] = {
  68     0,  1,   2,  3,  8,  9, 16, 17,
  69     10, 11,  4,  5,  6,  7, 15, 14,
  70     13, 12, 19, 18, 24, 25, 32, 33,
  71     26, 27, 20, 21, 22, 23, 28, 29,
  72     30, 31, 34, 35, 40, 41, 48, 49,
  73     42, 43, 36, 37, 38, 39, 44, 45,
  74     46, 47, 50, 51, 56, 57, 58, 59,
  75     52, 53, 54, 55, 60, 61, 62, 63,
  76 };
  77
  78 const uint8_t ff_alternate_vertical_scan[64] = {
  79     0,  8,  16, 24,  1,  9,  2, 10,
  80     17, 25, 32, 40, 48, 56, 57, 49,
  81     41, 33, 26, 18,  3, 11,  4, 12,
  82     19, 27, 34, 42, 50, 58, 35, 43,
  83     51, 59, 20, 28,  5, 13,  6, 14,
  84     21, 29, 36, 44, 52, 60, 37, 45,
  85     53, 61, 22, 30,  7, 15, 23, 31,
  86     38, 46, 54, 62, 39, 47, 55, 63,
  87 };
  88
  89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  90 const uint32_t inverse[256]={
  91          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  92  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  93  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  94  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  95  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  96  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  97   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  98   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  99   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 100   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 101   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 102   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 103   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 104   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 105   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 106   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 107   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 108   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 109   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 110   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 111   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 112   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 113   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 114   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 115   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 116   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 117   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 118   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 119   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 120   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 121   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 122   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 123 };
 124
 125 /* Input permutation for the simple_idct_mmx */
 126 static const uint8_t simple_mmx_permutation[64]={
 127         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 128         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 129         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 130         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 131         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 132         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 133         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 134         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 135 };
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= bswap_32(src[i+0]);
 212         dst[i+1]= bswap_32(src[i+1]);
 213         dst[i+2]= bswap_32(src[i+2]);
 214         dst[i+3]= bswap_32(src[i+3]);
 215         dst[i+4]= bswap_32(src[i+4]);
 216         dst[i+5]= bswap_32(src[i+5]);
 217         dst[i+6]= bswap_32(src[i+6]);
 218         dst[i+7]= bswap_32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= bswap_32(src[i+0]);
 222     }
 223 }
 224
 225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 226 {
 227     int s, i;
 228     uint32_t *sq = squareTbl + 256;
 229
 230     s = 0;
 231     for (i = 0; i < h; i++) {
 232         s += sq[pix1[0] - pix2[0]];
 233         s += sq[pix1[1] - pix2[1]];
 234         s += sq[pix1[2] - pix2[2]];
 235         s += sq[pix1[3] - pix2[3]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         s += sq[pix1[4] - pix2[4]];
 254         s += sq[pix1[5] - pix2[5]];
 255         s += sq[pix1[6] - pix2[6]];
 256         s += sq[pix1[7] - pix2[7]];
 257         pix1 += line_size;
 258         pix2 += line_size;
 259     }
 260     return s;
 261 }
 262
 263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[ 0] - pix2[ 0]];
 271         s += sq[pix1[ 1] - pix2[ 1]];
 272         s += sq[pix1[ 2] - pix2[ 2]];
 273         s += sq[pix1[ 3] - pix2[ 3]];
 274         s += sq[pix1[ 4] - pix2[ 4]];
 275         s += sq[pix1[ 5] - pix2[ 5]];
 276         s += sq[pix1[ 6] - pix2[ 6]];
 277         s += sq[pix1[ 7] - pix2[ 7]];
 278         s += sq[pix1[ 8] - pix2[ 8]];
 279         s += sq[pix1[ 9] - pix2[ 9]];
 280         s += sq[pix1[10] - pix2[10]];
 281         s += sq[pix1[11] - pix2[11]];
 282         s += sq[pix1[12] - pix2[12]];
 283         s += sq[pix1[13] - pix2[13]];
 284         s += sq[pix1[14] - pix2[14]];
 285         s += sq[pix1[15] - pix2[15]];
 286
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293
 294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 295 #ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
 296     int s, i, j;
 297     const int dec_count= w==8 ? 3 : 4;
 298     int tmp[16*16];
 299 #if 0
 300     int level, ori;
 301     static const int scale[2][2][4][4]={
 302       {
 303         {
 304             //8x8 dec=3
 305             {268, 239, 239, 213},
 306             {  0, 224, 224, 152},
 307             {  0, 135, 135, 110},
 308         },{
 309             //16x16 dec=4
 310             {344, 310, 310, 280},
 311             {  0, 320, 320, 228},
 312             {  0, 175, 175, 136},
 313             {  0, 129, 129, 102},
 314         }
 315       },{
 316         {//FIXME 5/3
 317             //8x8 dec=3
 318             {275, 245, 245, 218},
 319             {  0, 230, 230, 156},
 320             {  0, 138, 138, 113},
 321         },{
 322             //16x16 dec=4
 323             {352, 317, 317, 286},
 324             {  0, 328, 328, 233},
 325             {  0, 180, 180, 140},
 326             {  0, 132, 132, 105},
 327         }
 328       }
 329     };
 330 #endif
 331
 332     for (i = 0; i < h; i++) {
 333         for (j = 0; j < w; j+=4) {
 334             tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 335             tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 336             tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 337             tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 338         }
 339         pix1 += line_size;
 340         pix2 += line_size;
 341     }
 342
 343     ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
 344
 345     s=0;
 346 #if 0
 347     for(level=0; level<dec_count; level++){
 348         for(ori= level ? 1 : 0; ori<4; ori++){
 349             int sx= (ori&1) ? 1<<level: 0;
 350             int stride= 16<<(dec_count-level);
 351             int sy= (ori&2) ? stride>>1 : 0;
 352             int size= 1<<level;
 353
 354             for(i=0; i<size; i++){
 355                 for(j=0; j<size; j++){
 356                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 357                     s += ABS(v);
 358                 }
 359             }
 360         }
 361     }
 362 #endif
 363     for (i = 0; i < h; i++) {
 364         for (j = 0; j < w; j+=4) {
 365             s+= ABS(tmp[16*i+j+0]);
 366             s+= ABS(tmp[16*i+j+1]);
 367             s+= ABS(tmp[16*i+j+2]);
 368             s+= ABS(tmp[16*i+j+3]);
 369         }
 370     }
 371     assert(s>=0);
 372
 373     return s>>2;
 374 #endif
 375 }
 376
 377 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 378     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 379 }
 380
 381 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 382     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 383 }
 384
 385 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 386     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 387 }
 388
 389 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 390     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 391 }
 392
 393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 394 {
 395     int i;
 396
 397     /* read the pixels */
 398     for(i=0;i<8;i++) {
 399         block[0] = pixels[0];
 400         block[1] = pixels[1];
 401         block[2] = pixels[2];
 402         block[3] = pixels[3];
 403         block[4] = pixels[4];
 404         block[5] = pixels[5];
 405         block[6] = pixels[6];
 406         block[7] = pixels[7];
 407         pixels += line_size;
 408         block += 8;
 409     }
 410 }
 411
 412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 413                           const uint8_t *s2, int stride){
 414     int i;
 415
 416     /* read the pixels */
 417     for(i=0;i<8;i++) {
 418         block[0] = s1[0] - s2[0];
 419         block[1] = s1[1] - s2[1];
 420         block[2] = s1[2] - s2[2];
 421         block[3] = s1[3] - s2[3];
 422         block[4] = s1[4] - s2[4];
 423         block[5] = s1[5] - s2[5];
 424         block[6] = s1[6] - s2[6];
 425         block[7] = s1[7] - s2[7];
 426         s1 += stride;
 427         s2 += stride;
 428         block += 8;
 429     }
 430 }
 431
 432
 433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 434                                  int line_size)
 435 {
 436     int i;
 437     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 438
 439     /* read the pixels */
 440     for(i=0;i<8;i++) {
 441         pixels[0] = cm[block[0]];
 442         pixels[1] = cm[block[1]];
 443         pixels[2] = cm[block[2]];
 444         pixels[3] = cm[block[3]];
 445         pixels[4] = cm[block[4]];
 446         pixels[5] = cm[block[5]];
 447         pixels[6] = cm[block[6]];
 448         pixels[7] = cm[block[7]];
 449
 450         pixels += line_size;
 451         block += 8;
 452     }
 453 }
 454
 455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 456                                  int line_size)
 457 {
 458     int i;
 459     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 460
 461     /* read the pixels */
 462     for(i=0;i<4;i++) {
 463         pixels[0] = cm[block[0]];
 464         pixels[1] = cm[block[1]];
 465         pixels[2] = cm[block[2]];
 466         pixels[3] = cm[block[3]];
 467
 468         pixels += line_size;
 469         block += 8;
 470     }
 471 }
 472
 473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 474                                  int line_size)
 475 {
 476     int i;
 477     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 478
 479     /* read the pixels */
 480     for(i=0;i<2;i++) {
 481         pixels[0] = cm[block[0]];
 482         pixels[1] = cm[block[1]];
 483
 484         pixels += line_size;
 485         block += 8;
 486     }
 487 }
 488
 489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 490                                         uint8_t *restrict pixels,
 491                                         int line_size)
 492 {
 493     int i, j;
 494
 495     for (i = 0; i < 8; i++) {
 496         for (j = 0; j < 8; j++) {
 497             if (*block < -128)
 498                 *pixels = 0;
 499             else if (*block > 127)
 500                 *pixels = 255;
 501             else
 502                 *pixels = (uint8_t)(*block + 128);
 503             block++;
 504             pixels++;
 505         }
 506         pixels += (line_size - 8);
 507     }
 508 }
 509
 510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 511                           int line_size)
 512 {
 513     int i;
 514     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 515
 516     /* read the pixels */
 517     for(i=0;i<8;i++) {
 518         pixels[0] = cm[pixels[0] + block[0]];
 519         pixels[1] = cm[pixels[1] + block[1]];
 520         pixels[2] = cm[pixels[2] + block[2]];
 521         pixels[3] = cm[pixels[3] + block[3]];
 522         pixels[4] = cm[pixels[4] + block[4]];
 523         pixels[5] = cm[pixels[5] + block[5]];
 524         pixels[6] = cm[pixels[6] + block[6]];
 525         pixels[7] = cm[pixels[7] + block[7]];
 526         pixels += line_size;
 527         block += 8;
 528     }
 529 }
 530
 531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 532                           int line_size)
 533 {
 534     int i;
 535     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 536
 537     /* read the pixels */
 538     for(i=0;i<4;i++) {
 539         pixels[0] = cm[pixels[0] + block[0]];
 540         pixels[1] = cm[pixels[1] + block[1]];
 541         pixels[2] = cm[pixels[2] + block[2]];
 542         pixels[3] = cm[pixels[3] + block[3]];
 543         pixels += line_size;
 544         block += 8;
 545     }
 546 }
 547
 548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 549                           int line_size)
 550 {
 551     int i;
 552     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 553
 554     /* read the pixels */
 555     for(i=0;i<2;i++) {
 556         pixels[0] = cm[pixels[0] + block[0]];
 557         pixels[1] = cm[pixels[1] + block[1]];
 558         pixels += line_size;
 559         block += 8;
 560     }
 561 }
 562
 563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 564 {
 565     int i;
 566     for(i=0;i<8;i++) {
 567         pixels[0] += block[0];
 568         pixels[1] += block[1];
 569         pixels[2] += block[2];
 570         pixels[3] += block[3];
 571         pixels[4] += block[4];
 572         pixels[5] += block[5];
 573         pixels[6] += block[6];
 574         pixels[7] += block[7];
 575         pixels += line_size;
 576         block += 8;
 577     }
 578 }
 579
 580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 581 {
 582     int i;
 583     for(i=0;i<4;i++) {
 584         pixels[0] += block[0];
 585         pixels[1] += block[1];
 586         pixels[2] += block[2];
 587         pixels[3] += block[3];
 588         pixels += line_size;
 589         block += 4;
 590     }
 591 }
 592
 593 #if 0
 594
 595 #define PIXOP2(OPNAME, OP) \
 596 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 597 {\
 598     int i;\
 599     for(i=0; i<h; i++){\
 600         OP(*((uint64_t*)block), LD64(pixels));\
 601         pixels+=line_size;\
 602         block +=line_size;\
 603     }\
 604 }\
 605 \
 606 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 607 {\
 608     int i;\
 609     for(i=0; i<h; i++){\
 610         const uint64_t a= LD64(pixels  );\
 611         const uint64_t b= LD64(pixels+1);\
 612         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 613         pixels+=line_size;\
 614         block +=line_size;\
 615     }\
 616 }\
 617 \
 618 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 619 {\
 620     int i;\
 621     for(i=0; i<h; i++){\
 622         const uint64_t a= LD64(pixels  );\
 623         const uint64_t b= LD64(pixels+1);\
 624         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 625         pixels+=line_size;\
 626         block +=line_size;\
 627     }\
 628 }\
 629 \
 630 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 631 {\
 632     int i;\
 633     for(i=0; i<h; i++){\
 634         const uint64_t a= LD64(pixels          );\
 635         const uint64_t b= LD64(pixels+line_size);\
 636         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 637         pixels+=line_size;\
 638         block +=line_size;\
 639     }\
 640 }\
 641 \
 642 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 643 {\
 644     int i;\
 645     for(i=0; i<h; i++){\
 646         const uint64_t a= LD64(pixels          );\
 647         const uint64_t b= LD64(pixels+line_size);\
 648         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 649         pixels+=line_size;\
 650         block +=line_size;\
 651     }\
 652 }\
 653 \
 654 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 655 {\
 656         int i;\
 657         const uint64_t a= LD64(pixels  );\
 658         const uint64_t b= LD64(pixels+1);\
 659         uint64_t l0=  (a&0x0303030303030303ULL)\
 660                     + (b&0x0303030303030303ULL)\
 661                     + 0x0202020202020202ULL;\
 662         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 663                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 664         uint64_t l1,h1;\
 665 \
 666         pixels+=line_size;\
 667         for(i=0; i<h; i+=2){\
 668             uint64_t a= LD64(pixels  );\
 669             uint64_t b= LD64(pixels+1);\
 670             l1=  (a&0x0303030303030303ULL)\
 671                + (b&0x0303030303030303ULL);\
 672             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 673               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 674             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 675             pixels+=line_size;\
 676             block +=line_size;\
 677             a= LD64(pixels  );\
 678             b= LD64(pixels+1);\
 679             l0=  (a&0x0303030303030303ULL)\
 680                + (b&0x0303030303030303ULL)\
 681                + 0x0202020202020202ULL;\
 682             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 683               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 684             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 685             pixels+=line_size;\
 686             block +=line_size;\
 687         }\
 688 }\
 689 \
 690 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 691 {\
 692         int i;\
 693         const uint64_t a= LD64(pixels  );\
 694         const uint64_t b= LD64(pixels+1);\
 695         uint64_t l0=  (a&0x0303030303030303ULL)\
 696                     + (b&0x0303030303030303ULL)\
 697                     + 0x0101010101010101ULL;\
 698         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 699                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 700         uint64_t l1,h1;\
 701 \
 702         pixels+=line_size;\
 703         for(i=0; i<h; i+=2){\
 704             uint64_t a= LD64(pixels  );\
 705             uint64_t b= LD64(pixels+1);\
 706             l1=  (a&0x0303030303030303ULL)\
 707                + (b&0x0303030303030303ULL);\
 708             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 709               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 710             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 711             pixels+=line_size;\
 712             block +=line_size;\
 713             a= LD64(pixels  );\
 714             b= LD64(pixels+1);\
 715             l0=  (a&0x0303030303030303ULL)\
 716                + (b&0x0303030303030303ULL)\
 717                + 0x0101010101010101ULL;\
 718             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 719               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 720             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 721             pixels+=line_size;\
 722             block +=line_size;\
 723         }\
 724 }\
 725 \
 726 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 727 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 728 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 729 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 730 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 733
 734 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 735 #else // 64 bit variant
 736
 737 #define PIXOP2(OPNAME, OP) \
 738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 739     int i;\
 740     for(i=0; i<h; i++){\
 741         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 742         pixels+=line_size;\
 743         block +=line_size;\
 744     }\
 745 }\
 746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 747     int i;\
 748     for(i=0; i<h; i++){\
 749         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 750         pixels+=line_size;\
 751         block +=line_size;\
 752     }\
 753 }\
 754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 755     int i;\
 756     for(i=0; i<h; i++){\
 757         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 758         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 759         pixels+=line_size;\
 760         block +=line_size;\
 761     }\
 762 }\
 763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 764     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 765 }\
 766 \
 767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 768                                                 int src_stride1, int src_stride2, int h){\
 769     int i;\
 770     for(i=0; i<h; i++){\
 771         uint32_t a,b;\
 772         a= LD32(&src1[i*src_stride1  ]);\
 773         b= LD32(&src2[i*src_stride2  ]);\
 774         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 775         a= LD32(&src1[i*src_stride1+4]);\
 776         b= LD32(&src2[i*src_stride2+4]);\
 777         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 778     }\
 779 }\
 780 \
 781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 782                                                 int src_stride1, int src_stride2, int h){\
 783     int i;\
 784     for(i=0; i<h; i++){\
 785         uint32_t a,b;\
 786         a= LD32(&src1[i*src_stride1  ]);\
 787         b= LD32(&src2[i*src_stride2  ]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 789         a= LD32(&src1[i*src_stride1+4]);\
 790         b= LD32(&src2[i*src_stride2+4]);\
 791         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 792     }\
 793 }\
 794 \
 795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 796                                                 int src_stride1, int src_stride2, int h){\
 797     int i;\
 798     for(i=0; i<h; i++){\
 799         uint32_t a,b;\
 800         a= LD32(&src1[i*src_stride1  ]);\
 801         b= LD32(&src2[i*src_stride2  ]);\
 802         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 803     }\
 804 }\
 805 \
 806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 807                                                 int src_stride1, int src_stride2, int h){\
 808     int i;\
 809     for(i=0; i<h; i++){\
 810         uint32_t a,b;\
 811         a= LD16(&src1[i*src_stride1  ]);\
 812         b= LD16(&src2[i*src_stride2  ]);\
 813         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 814     }\
 815 }\
 816 \
 817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 818                                                 int src_stride1, int src_stride2, int h){\
 819     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 820     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 821 }\
 822 \
 823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 824                                                 int src_stride1, int src_stride2, int h){\
 825     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 826     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 827 }\
 828 \
 829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 834     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 835 }\
 836 \
 837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 838     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 839 }\
 840 \
 841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 842     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 843 }\
 844 \
 845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 846                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 847     int i;\
 848     for(i=0; i<h; i++){\
 849         uint32_t a, b, c, d, l0, l1, h0, h1;\
 850         a= LD32(&src1[i*src_stride1]);\
 851         b= LD32(&src2[i*src_stride2]);\
 852         c= LD32(&src3[i*src_stride3]);\
 853         d= LD32(&src4[i*src_stride4]);\
 854         l0=  (a&0x03030303UL)\
 855            + (b&0x03030303UL)\
 856            + 0x02020202UL;\
 857         h0= ((a&0xFCFCFCFCUL)>>2)\
 858           + ((b&0xFCFCFCFCUL)>>2);\
 859         l1=  (c&0x03030303UL)\
 860            + (d&0x03030303UL);\
 861         h1= ((c&0xFCFCFCFCUL)>>2)\
 862           + ((d&0xFCFCFCFCUL)>>2);\
 863         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 864         a= LD32(&src1[i*src_stride1+4]);\
 865         b= LD32(&src2[i*src_stride2+4]);\
 866         c= LD32(&src3[i*src_stride3+4]);\
 867         d= LD32(&src4[i*src_stride4+4]);\
 868         l0=  (a&0x03030303UL)\
 869            + (b&0x03030303UL)\
 870            + 0x02020202UL;\
 871         h0= ((a&0xFCFCFCFCUL)>>2)\
 872           + ((b&0xFCFCFCFCUL)>>2);\
 873         l1=  (c&0x03030303UL)\
 874            + (d&0x03030303UL);\
 875         h1= ((c&0xFCFCFCFCUL)>>2)\
 876           + ((d&0xFCFCFCFCUL)>>2);\
 877         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 878     }\
 879 }\
 880 \
 881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 882     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 883 }\
 884 \
 885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 886     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 887 }\
 888 \
 889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 890     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 891 }\
 892 \
 893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 894     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 895 }\
 896 \
 897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 898                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 899     int i;\
 900     for(i=0; i<h; i++){\
 901         uint32_t a, b, c, d, l0, l1, h0, h1;\
 902         a= LD32(&src1[i*src_stride1]);\
 903         b= LD32(&src2[i*src_stride2]);\
 904         c= LD32(&src3[i*src_stride3]);\
 905         d= LD32(&src4[i*src_stride4]);\
 906         l0=  (a&0x03030303UL)\
 907            + (b&0x03030303UL)\
 908            + 0x01010101UL;\
 909         h0= ((a&0xFCFCFCFCUL)>>2)\
 910           + ((b&0xFCFCFCFCUL)>>2);\
 911         l1=  (c&0x03030303UL)\
 912            + (d&0x03030303UL);\
 913         h1= ((c&0xFCFCFCFCUL)>>2)\
 914           + ((d&0xFCFCFCFCUL)>>2);\
 915         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 916         a= LD32(&src1[i*src_stride1+4]);\
 917         b= LD32(&src2[i*src_stride2+4]);\
 918         c= LD32(&src3[i*src_stride3+4]);\
 919         d= LD32(&src4[i*src_stride4+4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x01010101UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930     }\
 931 }\
 932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 933                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 934     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 935     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 936 }\
 937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 938                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 939     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 940     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 941 }\
 942 \
 943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 944 {\
 945         int i, a0, b0, a1, b1;\
 946         a0= pixels[0];\
 947         b0= pixels[1] + 2;\
 948         a0 += b0;\
 949         b0 += pixels[2];\
 950 \
 951         pixels+=line_size;\
 952         for(i=0; i<h; i+=2){\
 953             a1= pixels[0];\
 954             b1= pixels[1];\
 955             a1 += b1;\
 956             b1 += pixels[2];\
 957 \
 958             block[0]= (a1+a0)>>2; /* FIXME non put */\
 959             block[1]= (b1+b0)>>2;\
 960 \
 961             pixels+=line_size;\
 962             block +=line_size;\
 963 \
 964             a0= pixels[0];\
 965             b0= pixels[1] + 2;\
 966             a0 += b0;\
 967             b0 += pixels[2];\
 968 \
 969             block[0]= (a1+a0)>>2;\
 970             block[1]= (b1+b0)>>2;\
 971             pixels+=line_size;\
 972             block +=line_size;\
 973         }\
 974 }\
 975 \
 976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 977 {\
 978         int i;\
 979         const uint32_t a= LD32(pixels  );\
 980         const uint32_t b= LD32(pixels+1);\
 981         uint32_t l0=  (a&0x03030303UL)\
 982                     + (b&0x03030303UL)\
 983                     + 0x02020202UL;\
 984         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 985                    + ((b&0xFCFCFCFCUL)>>2);\
 986         uint32_t l1,h1;\
 987 \
 988         pixels+=line_size;\
 989         for(i=0; i<h; i+=2){\
 990             uint32_t a= LD32(pixels  );\
 991             uint32_t b= LD32(pixels+1);\
 992             l1=  (a&0x03030303UL)\
 993                + (b&0x03030303UL);\
 994             h1= ((a&0xFCFCFCFCUL)>>2)\
 995               + ((b&0xFCFCFCFCUL)>>2);\
 996             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 997             pixels+=line_size;\
 998             block +=line_size;\
 999             a= LD32(pixels  );\
1000             b= LD32(pixels+1);\
1001             l0=  (a&0x03030303UL)\
1002                + (b&0x03030303UL)\
1003                + 0x02020202UL;\
1004             h0= ((a&0xFCFCFCFCUL)>>2)\
1005               + ((b&0xFCFCFCFCUL)>>2);\
1006             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007             pixels+=line_size;\
1008             block +=line_size;\
1009         }\
1010 }\
1011 \
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1013 {\
1014     int j;\
1015     for(j=0; j<2; j++){\
1016         int i;\
1017         const uint32_t a= LD32(pixels  );\
1018         const uint32_t b= LD32(pixels+1);\
1019         uint32_t l0=  (a&0x03030303UL)\
1020                     + (b&0x03030303UL)\
1021                     + 0x02020202UL;\
1022         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023                    + ((b&0xFCFCFCFCUL)>>2);\
1024         uint32_t l1,h1;\
1025 \
1026         pixels+=line_size;\
1027         for(i=0; i<h; i+=2){\
1028             uint32_t a= LD32(pixels  );\
1029             uint32_t b= LD32(pixels+1);\
1030             l1=  (a&0x03030303UL)\
1031                + (b&0x03030303UL);\
1032             h1= ((a&0xFCFCFCFCUL)>>2)\
1033               + ((b&0xFCFCFCFCUL)>>2);\
1034             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035             pixels+=line_size;\
1036             block +=line_size;\
1037             a= LD32(pixels  );\
1038             b= LD32(pixels+1);\
1039             l0=  (a&0x03030303UL)\
1040                + (b&0x03030303UL)\
1041                + 0x02020202UL;\
1042             h0= ((a&0xFCFCFCFCUL)>>2)\
1043               + ((b&0xFCFCFCFCUL)>>2);\
1044             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045             pixels+=line_size;\
1046             block +=line_size;\
1047         }\
1048         pixels+=4-line_size*(h+1);\
1049         block +=4-line_size*h;\
1050     }\
1051 }\
1052 \
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1054 {\
1055     int j;\
1056     for(j=0; j<2; j++){\
1057         int i;\
1058         const uint32_t a= LD32(pixels  );\
1059         const uint32_t b= LD32(pixels+1);\
1060         uint32_t l0=  (a&0x03030303UL)\
1061                     + (b&0x03030303UL)\
1062                     + 0x01010101UL;\
1063         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064                    + ((b&0xFCFCFCFCUL)>>2);\
1065         uint32_t l1,h1;\
1066 \
1067         pixels+=line_size;\
1068         for(i=0; i<h; i+=2){\
1069             uint32_t a= LD32(pixels  );\
1070             uint32_t b= LD32(pixels+1);\
1071             l1=  (a&0x03030303UL)\
1072                + (b&0x03030303UL);\
1073             h1= ((a&0xFCFCFCFCUL)>>2)\
1074               + ((b&0xFCFCFCFCUL)>>2);\
1075             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076             pixels+=line_size;\
1077             block +=line_size;\
1078             a= LD32(pixels  );\
1079             b= LD32(pixels+1);\
1080             l0=  (a&0x03030303UL)\
1081                + (b&0x03030303UL)\
1082                + 0x01010101UL;\
1083             h0= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086             pixels+=line_size;\
1087             block +=line_size;\
1088         }\
1089         pixels+=4-line_size*(h+1);\
1090         block +=4-line_size*h;\
1091     }\
1092 }\
1093 \
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1102
1103 #define op_avg(a, b) a = rnd_avg32(a, b)
1104 #endif
/* Store operator handed to PIXOP2: overwrite the destination a with b. */
#define op_put(a, b) a = b

/* Expand the PIXOP2 function family twice: once with the avg prefix using
 * op_avg and once with the put prefix using op_put. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* Retire the operator macros; the same names are defined again with
 * different bodies further down in this file. */
#undef op_avg
#undef op_put
1111
/* Rounded integer average of two values: (a + b + 1) / 2.
 * Every argument is parenthesized so that callers may pass expressions
 * containing operators of lower precedence than '+' (|, &, <<, ?:). */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded integer average of four values: (a + b + c + d + 2) / 4. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1114
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources a and b are walked with the same line stride for h rows.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1118
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources a and b are walked with the same line stride for h rows.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1122
/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block.
 *
 * x16 and y16 are fractional offsets in 1/16 units; the four neighbour
 * weights (16-x16)*(16-y16), x16*(16-y16), (16-x16)*y16 and x16*y16 sum to
 * 256, so the weighted sum plus rounder is scaled back with >>8.
 * Reads 9 samples per row from h+1 source rows; dst and src share stride.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16); /* top-left     */
    const int w_tr = (     x16) * (16 - y16); /* top-right    */
    const int w_bl = (16 - x16) * (     y16); /* bottom-left  */
    const int w_br = (     x16) * (     y16); /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (  w_tl * src[col]          + w_tr * src[col + 1]
                        + w_bl * src[stride + col] + w_br * src[stride + col + 1]
                        + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1145
/**
 * Motion compensation with a per-pixel source position, for an
 * 8-pixel-wide, h-row block.
 *
 * The source position starts at (ox, oy), advances by (dxx, dyx) per output
 * column and by (dxy, dyy) per output row. After dropping the low 16 bits,
 * the low `shift` bits of each coordinate are the interpolation fraction and
 * the remaining bits the integer sample position.
 *
 * Positions inside [0, width-1) x [0, height-1) are bilinearly interpolated.
 * When only one axis is inside, the other is clipped and interpolation is
 * done along the inside axis only. When both are outside, the clipped sample
 * is copied as is. r is the rounding term added before the >>(2*shift).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one = 1 << shift;
    int row;

    /* From here on width/height hold the largest valid coordinate. */
    width--;
    height--;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & (one - 1);
            const int frac_y = pos_y & (one - 1);
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* The unsigned compare rejects negative coordinates too. */
            const int x_inside = (unsigned)src_x < width;
            const int y_inside = (unsigned)src_y < height;
            int index;

            if (x_inside && y_inside) {
                index = src_x + src_y * stride;
                out[col] = (  (  src[index             ] * (one - frac_x)
                               + src[index          + 1] *        frac_x ) * (one - frac_y)
                            + (  src[index + stride    ] * (one - frac_x)
                               + src[index + stride + 1] *        frac_x ) *        frac_y
                            + r) >> (shift * 2);
            } else if (x_inside) {
                index = src_x + clip(src_y, 0, height) * stride;
                out[col] = (  (  src[index    ] * (one - frac_x)
                               + src[index + 1] *        frac_x ) * one
                            + r) >> (shift * 2);
            } else if (y_inside) {
                index = clip(src_x, 0, width) + src_y * stride;
                out[col] = (  (  src[index         ] * (one - frac_y)
                               + src[index + stride] *        frac_y ) * one
                            + r) >> (shift * 2);
            } else {
                index = clip(src_x, 0, width) + clip(src_y, 0, height) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1203
/**
 * Dispatches to the put_pixelsN_c routine matching width (2, 4, 8 or 16).
 * Any other width is silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1212
/**
 * Fills a width x height block with (683 * (2*cur + right + 1)) >> 11,
 * i.e. a 2:1 blend of each source sample and its right neighbour, scaled by
 * 683/2048. One sample past the right edge of every source row is read.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2 * in[col] + in[col + 1] + 1;

            out[col] = (683 * weighted) >> 11;
        }
    }
}
1223
/**
 * Fills a width x height block with (683 * (cur + 2*right + 1)) >> 11,
 * i.e. a 1:2 blend of each source sample and its right neighbour, scaled by
 * 683/2048. One sample past the right edge of every source row is read.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = in[col] + 2 * in[col + 1] + 1;

            out[col] = (683 * weighted) >> 11;
        }
    }
}
1234
/**
 * Fills a width x height block with (683 * (2*cur + below + 1)) >> 11,
 * i.e. a 2:1 blend of each source sample and the one a stride below it,
 * scaled by 683/2048. One source row beyond the block height is read.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2 * in[col] + in[col + stride] + 1;

            out[col] = (683 * weighted) >> 11;
        }
    }
}
1245
/**
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * sample with weights 4 (cur), 3 (right), 3 (below), 2 (below-right):
 * (2731 * (weighted sum + 6)) >> 15, the weights summing to 12.
 * Reads one extra column and one extra row of the source.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = 4 * top[col] + 3 * top[col + 1] + 3 * bot[col] + 2 * bot[col + 1] + 6;

            out[col] = (2731 * weighted) >> 15;
        }
    }
}
1256
/**
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * sample with weights 3 (cur), 2 (right), 4 (below), 3 (below-right):
 * (2731 * (weighted sum + 6)) >> 15, the weights summing to 12.
 * Reads one extra column and one extra row of the source.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3 * top[col] + 2 * top[col + 1] + 4 * bot[col] + 3 * bot[col + 1] + 6;

            out[col] = (2731 * weighted) >> 15;
        }
    }
}
1267
/**
 * Fills a width x height block with (683 * (cur + 2*below + 1)) >> 11,
 * i.e. a 1:2 blend of each source sample and the one a stride below it,
 * scaled by 683/2048. One source row beyond the block height is read.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = in[col] + 2 * in[col + stride] + 1;

            out[col] = (683 * weighted) >> 11;
        }
    }
}
1278
/**
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * sample with weights 3 (cur), 4 (right), 2 (below), 3 (below-right):
 * (2731 * (weighted sum + 6)) >> 15, the weights summing to 12.
 * Reads one extra column and one extra row of the source.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3 * top[col] + 4 * top[col + 1] + 2 * bot[col] + 3 * bot[col + 1] + 6;

            out[col] = (2731 * weighted) >> 15;
        }
    }
}
1289
/**
 * Fills a width x height block from the 2x2 source neighbourhood of each
 * sample with weights 2 (cur), 3 (right), 3 (below), 4 (below-right):
 * (2731 * (weighted sum + 6)) >> 15, the weights summing to 12.
 * Reads one extra column and one extra row of the source.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2 * top[col] + 3 * top[col + 1] + 3 * bot[col] + 4 * bot[col + 1] + 6;

            out[col] = (2731 * weighted) >> 15;
        }
    }
}
1300
/**
 * Dispatches to the avg_pixelsN_c routine matching width (2, 4, 8 or 16).
 * Any other width is silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1309
/**
 * Computes (683 * (2*cur + right + 1)) >> 11 for every sample of a
 * width x height block and replaces dst with the rounded-up mean of that
 * value and the sample already in dst.
 * One sample past the right edge of every source row is read.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * in[col] + in[col + 1] + 1)) >> 11;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1320
/**
 * Computes (683 * (cur + 2*right + 1)) >> 11 for every sample of a
 * width x height block and replaces dst with the rounded-up mean of that
 * value and the sample already in dst.
 * One sample past the right edge of every source row is read.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (683 * (in[col] + 2 * in[col + 1] + 1)) >> 11;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1331
/**
 * Computes (683 * (2*cur + below + 1)) >> 11 for every sample of a
 * width x height block and replaces dst with the rounded-up mean of that
 * value and the sample already in dst.
 * One source row beyond the block height is read.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * in[col] + in[col + stride] + 1)) >> 11;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1342
/**
 * Interpolates each sample from its 2x2 source neighbourhood with weights
 * 4 (cur), 3 (right), 3 (below), 2 (below-right) as
 * (2731 * (weighted sum + 6)) >> 15, then replaces dst with the rounded-up
 * mean of that value and the sample already in dst.
 * Reads one extra column and one extra row of the source.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731 * (4 * top[col] + 3 * top[col + 1] + 3 * bot[col] + 2 * bot[col + 1] + 6)) >> 15;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1353
/**
 * Interpolates each sample from its 2x2 source neighbourhood with weights
 * 3 (cur), 2 (right), 4 (below), 3 (below-right) as
 * (2731 * (weighted sum + 6)) >> 15, then replaces dst with the rounded-up
 * mean of that value and the sample already in dst.
 * Reads one extra column and one extra row of the source.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731 * (3 * top[col] + 2 * top[col + 1] + 4 * bot[col] + 3 * bot[col + 1] + 6)) >> 15;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1364
/**
 * Computes (683 * (cur + 2*below + 1)) >> 11 for every sample of a
 * width x height block and replaces dst with the rounded-up mean of that
 * value and the sample already in dst.
 * One source row beyond the block height is read.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (683 * (in[col] + 2 * in[col + stride] + 1)) >> 11;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1375
/**
 * Interpolates each sample from its 2x2 source neighbourhood with weights
 * 3 (cur), 4 (right), 2 (below), 3 (below-right) as
 * (2731 * (weighted sum + 6)) >> 15, then replaces dst with the rounded-up
 * mean of that value and the sample already in dst.
 * Reads one extra column and one extra row of the source.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731 * (3 * top[col] + 4 * top[col + 1] + 2 * bot[col] + 3 * bot[col + 1] + 6)) >> 15;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
1386
/**
 * Interpolates each sample from its 2x2 source neighbourhood with weights
 * 2 (cur), 3 (right), 3 (below), 4 (below-right) as
 * (2731 * (weighted sum + 6)) >> 15, then replaces dst with the rounded-up
 * mean of that value and the sample already in dst.
 * Reads one extra column and one extra row of the source.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *top = src + row * stride;
        const uint8_t *bot = top + stride;
        uint8_t       *out = dst + row * stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731 * (2 * top[col] + 3 * top[col + 1] + 3 * bot[col] + 4 * bot[col + 1] + 6)) >> 15;

            out[col] = (out[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/*
 * Disabled generator for fixed-width put_tpel wrappers. TPEL_WIDTH(w)
 * defines put_tpel_pixels<w>_mcXY_c(dst, src, stride, height) for the nine
 * XY positions, each forwarding to the generic put_tpel_pixels_mcXY_c with
 * the width filled in.
 *
 * The wrapper bodies used to read "void put_tpel_pixels_mcXY_c(...);", which
 * is a declaration rather than a call; the stray "void" has been removed so
 * the macro forwards as intended should this block ever be enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1418
/*
 * H264_CHROMA_MC(OPNAME, OP) generates OPNAME##h264_chroma_mc{2,4,8}_c:
 * bilinear interpolation of a 2-, 4- or 8-sample-wide block of h rows.
 *
 * x and y are fractional offsets in 1/8 units (asserted to lie in 0..7).
 * The four neighbour weights (8-x)*(8-y), x*(8-y), (8-x)*y and x*y sum to
 * 64; the raw weighted sum is handed to OP, which is responsible for the
 * rounding and the >>6 scaling. One extra source column and one extra source
 * row are read. dst and src share the same stride.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}

/* b is the weighted sum scaled by 64: round it, scale it down, and either
 * average it (rounding up) with the sample already in a, or store it. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1489
/**
 * Copies h rows of 4 bytes from src to dst, one LD32/ST32 pair per row.
 * dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1500
/**
 * Copies h rows of 8 bytes from src to dst as two 32-bit LD32/ST32 moves
 * per row. dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1512
/**
 * Copies h rows of 16 bytes from src to dst as four 32-bit LD32/ST32 moves
 * per row. dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1526
/**
 * Copies h rows of 17 bytes from src to dst: four 32-bit LD32/ST32 moves
 * followed by a single byte copy for column 16.
 * dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1541
/**
 * Copies h rows of 9 bytes from src to dst: two 32-bit LD32/ST32 moves
 * followed by a single byte copy for column 8.
 * dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1554
1555
1556 #define QPEL_MC(r, OPNAME, RND, OP) \
1557 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1558     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1559     int i;\
1560     for(i=0; i<h; i++)\
1561     {\
1562         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1563         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1564         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1565         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1566         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1567         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1568         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1569         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1570         dst+=dstStride;\
1571         src+=srcStride;\
1572     }\
1573 }\
1574 \
1575 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 8-tap lowpass over an 8-column block.\
      * For each of the w=8 columns it reads the 9 samples src[0..8*srcStride] and emits 8 output rows through OP.\
      * Taps are 20, -6, 3, -1 on each side (sum 32); near the first and last rows the outer taps\
      * substitute mirrored in-range samples, so nothing outside rows 0..8 is read. */\
1576     const int w=8;\
1577     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; the op_* macros passed as OP index through cm */\
1578     int i;\
1579     for(i=0; i<w; i++)\
1580     {\
1581         const int src0= src[0*srcStride];/* load the whole column once, then reuse it for all 8 outputs */\
1582         const int src1= src[1*srcStride];\
1583         const int src2= src[2*srcStride];\
1584         const int src3= src[3*srcStride];\
1585         const int src4= src[4*srcStride];\
1586         const int src5= src[5*srcStride];\
1587         const int src6= src[6*srcStride];\
1588         const int src7= src[7*srcStride];\
1589         const int src8= src[8*srcStride];\
1590         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1591         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1592         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1593         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1594         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1595         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1596         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1597         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1598         dst++;/* advance to the next column */\
1599         src++;\
1600     }\
1601 }\
1602 \
1603 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){/* Horizontal 8-tap lowpass, 16 outputs per row, for h rows.\
      * Each row reads src[0..16] (17 samples) and writes dst[0..15] through OP; taps are 20, -6, 3, -1 per side (sum 32).\
      * At both ends of the row the outer taps substitute mirrored in-range samples, so no index outside 0..16 is read. */\
1604     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; the op_* macros passed as OP index through cm */\
1605     int i;\
1606     \
1607     for(i=0; i<h; i++)\
1608     {\
1609         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1610         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1611         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1612         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1613         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1614         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1615         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1616         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1617         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1618         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1619         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1620         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1621         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1622         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1623         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1624         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1625         dst+=dstStride;/* next row */\
1626         src+=srcStride;\
1627     }\
1628 }\
1629 \
1630 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 8-tap lowpass over a 16-column block.\
      * For each of the w=16 columns it reads the 17 samples src[0..16*srcStride] and emits 16 output rows through OP.\
      * Taps are 20, -6, 3, -1 per side (sum 32); the first and last rows substitute mirrored in-range samples\
      * for the outer taps, so nothing outside rows 0..16 is read. */\
1631     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; the op_* macros passed as OP index through cm */\
1632     int i;\
1633     const int w=16;\
1634     for(i=0; i<w; i++)\
1635     {\
1636         const int src0= src[0*srcStride];/* load the whole column once, then reuse it for all 16 outputs */\
1637         const int src1= src[1*srcStride];\
1638         const int src2= src[2*srcStride];\
1639         const int src3= src[3*srcStride];\
1640         const int src4= src[4*srcStride];\
1641         const int src5= src[5*srcStride];\
1642         const int src6= src[6*srcStride];\
1643         const int src7= src[7*srcStride];\
1644         const int src8= src[8*srcStride];\
1645         const int src9= src[9*srcStride];\
1646         const int src10= src[10*srcStride];\
1647         const int src11= src[11*srcStride];\
1648         const int src12= src[12*srcStride];\
1649         const int src13= src[13*srcStride];\
1650         const int src14= src[14*srcStride];\
1651         const int src15= src[15*srcStride];\
1652         const int src16= src[16*srcStride];\
1653         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1654         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1655         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1656         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1657         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1658         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1659         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1660         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1661         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1662         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1663         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1664         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1665         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1666         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1667         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1668         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1669         dst++;/* advance to the next column */\
1670         src++;\
1671     }\
1672 }\
1673 \
1674 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){/* No filtering: hands the 8-row block straight to OPNAME pixels8_c. */\
1675     OPNAME ## pixels8_c(dst, src, stride, 8);\
1676 }\
1677 \
1678 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of src into half (8x8, stride 8),\
      * always through the put variant selected by RND; then src and half are combined into dst via OPNAME pixels8_l2. */\
1679     uint8_t half[64];\
1680     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1681     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1682 }\
1683 \
1684 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 8 rows written directly into dst with the OPNAME operation. */\
1685     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1686 }\
1687 \
1688 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc10 variant except that the unfiltered\
      * input to pixels8_l2 is src+1 (one sample to the right) instead of src. */\
1689     uint8_t half[64];\
1690     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1691     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1692 }\
1693 \
1694 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){/* Copies the source into full (stride 16, 9 rows) with copy_block9,\
      * vertically lowpasses it into half (8x8), then combines full and half into dst via OPNAME pixels8_l2. */\
1695     uint8_t full[16*9];\
1696     uint8_t half[64];\
1697     copy_block9(full, src, 16, stride, 9);\
1698     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1699     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1700 }\
1701 \
1702 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){/* Copies the source into full (stride 16, 9 rows),\
      * then applies the vertical lowpass directly into dst with the OPNAME operation. */\
1703     uint8_t full[16*9];\
1704     copy_block9(full, src, 16, stride, 9);\
1705     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1706 }\
1707 \
1708 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc01 variant except that the unfiltered\
      * input to pixels8_l2 is full+16, i.e. one row further down in the stride-16 copy. */\
1709     uint8_t full[16*9];\
1710     uint8_t half[64];\
1711     copy_block9(full, src, 16, stride, 9);\
1712     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1713     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1714 }\
1715 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * Builds halfH (h-lowpass of full, 9 rows), halfV (v-lowpass of full) and halfHV (v-lowpass of halfH),\
      * then combines full, halfH, halfV and halfHV into dst via OPNAME pixels8_l4. */\
1716     uint8_t full[16*9];\
1717     uint8_t halfH[72];/* 8 wide x 9 rows */\
1718     uint8_t halfV[64];\
1719     uint8_t halfHV[64];\
1720     copy_block9(full, src, 16, stride, 9);\
1721     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1723     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1725 }\
1726 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * halfH = h-lowpass of full (9 rows), then halfH is overwritten in place by pixels8_l2(halfH, full);\
      * halfHV = v-lowpass of that updated halfH; dst = OPNAME pixels8_l2 of halfH and halfHV. */\
1727     uint8_t full[16*9];\
1728     uint8_t halfH[72];/* 8 wide x 9 rows */\
1729     uint8_t halfHV[64];\
1730     copy_block9(full, src, 16, stride, 9);\
1731     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1733     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1735 }\
1736 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * Like the mc11 old variant, but halfV is the v-lowpass of full+1 and the unfiltered input to pixels8_l4\
      * is full+1 (one sample to the right). */\
1737     uint8_t full[16*9];\
1738     uint8_t halfH[72];/* 8 wide x 9 rows */\
1739     uint8_t halfV[64];\
1740     uint8_t halfHV[64];\
1741     copy_block9(full, src, 16, stride, 9);\
1742     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1744     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1746 }\
1747 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * Like the mc11 variant, but halfH is combined in place with full+1 (one sample to the right)\
      * before the vertical pass. */\
1748     uint8_t full[16*9];\
1749     uint8_t halfH[72];/* 8 wide x 9 rows */\
1750     uint8_t halfHV[64];\
1751     copy_block9(full, src, 16, stride, 9);\
1752     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1753     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1754     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1756 }\
1757 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * Like the mc11 old variant, but pixels8_l4 takes full+16 and halfH+8, i.e. both buffers\
      * are entered one row further down (strides 16 and 8). */\
1758     uint8_t full[16*9];\
1759     uint8_t halfH[72];/* 8 wide x 9 rows */\
1760     uint8_t halfV[64];\
1761     uint8_t halfHV[64];\
1762     copy_block9(full, src, 16, stride, 9);\
1763     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1765     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1767 }\
1768 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * Like the mc11 variant, but the final pixels8_l2 reads halfH+8, i.e. starts one row\
      * further down in the stride-8 halfH buffer. */\
1769     uint8_t full[16*9];\
1770     uint8_t halfH[72];/* 8 wide x 9 rows */\
1771     uint8_t halfHV[64];\
1772     copy_block9(full, src, 16, stride, 9);\
1773     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1774     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1775     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1777 }\
1778 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * halfV is the v-lowpass of full+1; pixels8_l4 takes full+17 (one row down and one sample right\
      * in the stride-16 copy) and halfH+8 (one row down in halfH). */\
1779     uint8_t full[16*9];\
1780     uint8_t halfH[72];/* 8 wide x 9 rows */\
1781     uint8_t halfV[64];\
1782     uint8_t halfHV[64];\
1783     copy_block9(full, src, 16, stride, 9);\
1784     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1785     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1786     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1788 }\
1789 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * halfH is combined in place with full+1 (one sample to the right) before the vertical pass,\
      * and the final pixels8_l2 reads halfH+8 (one row down in halfH). */\
1790     uint8_t full[16*9];\
1791     uint8_t halfH[72];/* 8 wide x 9 rows */\
1792     uint8_t halfHV[64];\
1793     copy_block9(full, src, 16, stride, 9);\
1794     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1796     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1797     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1798 }\
1799 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){/* halfH = h-lowpass of 9 rows read straight from src (no copy);\
      * halfHV = v-lowpass of halfH; dst = OPNAME pixels8_l2 of halfH and halfHV. */\
1800     uint8_t halfH[72];/* 8 wide x 9 rows */\
1801     uint8_t halfHV[64];\
1802     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1803     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1805 }\
1806 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the mc21 variant except that the final pixels8_l2\
      * reads halfH+8, i.e. starts one row further down in halfH. */\
1807     uint8_t halfH[72];/* 8 wide x 9 rows */\
1808     uint8_t halfHV[64];\
1809     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1810     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1811     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1812 }\
1813 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant.\
      * Builds halfH (h-lowpass of full, 9 rows), halfV (v-lowpass of full) and halfHV (v-lowpass of halfH);\
      * dst = OPNAME pixels8_l2 of halfV and halfHV. halfH only feeds halfHV. */\
1814     uint8_t full[16*9];\
1815     uint8_t halfH[72];/* 8 wide x 9 rows */\
1816     uint8_t halfV[64];\
1817     uint8_t halfHV[64];\
1818     copy_block9(full, src, 16, stride, 9);\
1819     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1820     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1821     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1823 }\
1824 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){/* halfH = h-lowpass of full (9 rows), overwritten in place by\
      * pixels8_l2(halfH, full); the vertical lowpass of that result goes directly into dst with the OPNAME operation. */\
1825     uint8_t full[16*9];\
1826     uint8_t halfH[72];/* 8 wide x 9 rows */\
1827     copy_block9(full, src, 16, stride, 9);\
1828     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1830     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1831 }\
1832 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant.\
      * Like the mc12 old variant, but halfV is the v-lowpass of full+1 (one sample to the right);\
      * dst = OPNAME pixels8_l2 of halfV and halfHV. */\
1833     uint8_t full[16*9];\
1834     uint8_t halfH[72];/* 8 wide x 9 rows */\
1835     uint8_t halfV[64];\
1836     uint8_t halfHV[64];\
1837     copy_block9(full, src, 16, stride, 9);\
1838     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1842 }\
1843 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){/* Like the mc12 variant, but halfH is combined in place\
      * with full+1 (one sample to the right) before the vertical lowpass into dst. */\
1844     uint8_t full[16*9];\
1845     uint8_t halfH[72];/* 8 wide x 9 rows */\
1846     copy_block9(full, src, 16, stride, 9);\
1847     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1848     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1849     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1850 }\
1851 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 9 rows from src into halfH,\
      * followed by the vertical lowpass of halfH directly into dst with the OPNAME operation. */\
1852     uint8_t halfH[72];/* 8 wide x 9 rows */\
1853     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1854     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1855 }\
1856 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){/* No filtering: hands the 16-row block straight to OPNAME pixels16_c. */\
1857     OPNAME ## pixels16_c(dst, src, stride, 16);\
1858 }\
1859 \
1860 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of src into half (16x16, stride 16),\
      * always through the put variant selected by RND; then src and half are combined into dst via OPNAME pixels16_l2. */\
1861     uint8_t half[256];\
1862     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1863     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1864 }\
1865 \
1866 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 16 rows written directly into dst with the OPNAME operation. */\
1867     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1868 }\
1869 \
1870 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the 16-wide mc10 variant except that the unfiltered\
      * input to pixels16_l2 is src+1 (one sample to the right) instead of src. */\
1871     uint8_t half[256];\
1872     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1873     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1874 }\
1875 \
1876 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){/* Copies the source into full (stride 24, 17 rows) with copy_block17,\
      * vertically lowpasses it into half (16x16), then combines full and half into dst via OPNAME pixels16_l2. */\
1877     uint8_t full[24*17];\
1878     uint8_t half[256];\
1879     copy_block17(full, src, 24, stride, 17);\
1880     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1881     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1882 }\
1883 \
1884 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){/* Copies the source into full (stride 24, 17 rows),\
      * then applies the vertical lowpass directly into dst with the OPNAME operation. */\
1885     uint8_t full[24*17];\
1886     copy_block17(full, src, 24, stride, 17);\
1887     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1888 }\
1889 \
1890 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the 16-wide mc01 variant except that the unfiltered\
      * input to pixels16_l2 is full+24, i.e. one row further down in the stride-24 copy. */\
1891     uint8_t full[24*17];\
1892     uint8_t half[256];\
1893     copy_block17(full, src, 24, stride, 17);\
1894     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1895     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1896 }\
1897 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * Builds halfH (h-lowpass of full, 17 rows), halfV (v-lowpass of full) and halfHV (v-lowpass of halfH),\
      * then combines full, halfH, halfV and halfHV into dst via OPNAME pixels16_l4. */\
1898     uint8_t full[24*17];\
1899     uint8_t halfH[272];/* 16 wide x 17 rows */\
1900     uint8_t halfV[256];\
1901     uint8_t halfHV[256];\
1902     copy_block17(full, src, 24, stride, 17);\
1903     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1905     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1907 }\
1908 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * halfH = h-lowpass of full (17 rows), then halfH is overwritten in place by pixels16_l2(halfH, full);\
      * halfHV = v-lowpass of that updated halfH; dst = OPNAME pixels16_l2 of halfH and halfHV. */\
1909     uint8_t full[24*17];\
1910     uint8_t halfH[272];/* 16 wide x 17 rows */\
1911     uint8_t halfHV[256];\
1912     copy_block17(full, src, 24, stride, 17);\
1913     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1915     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1917 }\
1918 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * Like the 16-wide mc11 old variant, but halfV is the v-lowpass of full+1 and the unfiltered input\
      * to pixels16_l4 is full+1 (one sample to the right). */\
1919     uint8_t full[24*17];\
1920     uint8_t halfH[272];/* 16 wide x 17 rows */\
1921     uint8_t halfV[256];\
1922     uint8_t halfHV[256];\
1923     copy_block17(full, src, 24, stride, 17);\
1924     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1926     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1928 }\
1929 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * Like the 16-wide mc11 variant, but halfH is combined in place with full+1 (one sample to the right)\
      * before the vertical pass. */\
1930     uint8_t full[24*17];\
1931     uint8_t halfH[272];/* 16 wide x 17 rows */\
1932     uint8_t halfHV[256];\
1933     copy_block17(full, src, 24, stride, 17);\
1934     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1935     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1936     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1938 }\
1939 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * Like the 16-wide mc11 old variant, but pixels16_l4 takes full+24 and halfH+16, i.e. both buffers\
      * are entered one row further down (strides 24 and 16). */\
1940     uint8_t full[24*17];\
1941     uint8_t halfH[272];/* 16 wide x 17 rows */\
1942     uint8_t halfV[256];\
1943     uint8_t halfHV[256];\
1944     copy_block17(full, src, 24, stride, 17);\
1945     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1947     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1949 }\
1950 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * Like the 16-wide mc11 variant, but the final pixels16_l2 reads halfH+16, i.e. starts one row\
      * further down in the stride-16 halfH buffer. */\
1951     uint8_t full[24*17];\
1952     uint8_t halfH[272];/* 16 wide x 17 rows */\
1953     uint8_t halfHV[256];\
1954     copy_block17(full, src, 24, stride, 17);\
1955     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1956     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1957     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1959 }\
1960 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static four-source variant.\
      * halfV is the v-lowpass of full+1; pixels16_l4 takes full+25 (one row down and one sample right\
      * in the stride-24 copy) and halfH+16 (one row down in halfH). */\
1961     uint8_t full[24*17];\
1962     uint8_t halfH[272];/* 16 wide x 17 rows */\
1963     uint8_t halfV[256];\
1964     uint8_t halfHV[256];\
1965     copy_block17(full, src, 24, stride, 17);\
1966     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1967     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1968     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1970 }\
1971 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){/* Two-source variant.\
      * halfH is combined in place with full+1 (one sample to the right) before the vertical pass,\
      * and the final pixels16_l2 reads halfH+16 (one row down in halfH). */\
1972     uint8_t full[24*17];\
1973     uint8_t halfH[272];/* 16 wide x 17 rows */\
1974     uint8_t halfHV[256];\
1975     copy_block17(full, src, 24, stride, 17);\
1976     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1978     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1979     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1980 }\
1981 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){/* halfH = h-lowpass of 17 rows read straight from src (no copy);\
      * halfHV = v-lowpass of halfH; dst = OPNAME pixels16_l2 of halfH and halfHV. */\
1982     uint8_t halfH[272];/* 16 wide x 17 rows */\
1983     uint8_t halfHV[256];\
1984     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1985     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1986     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1987 }\
1988 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){/* Same as the 16-wide mc21 variant except that the final\
      * pixels16_l2 reads halfH+16, i.e. starts one row further down in halfH. */\
1989     uint8_t halfH[272];/* 16 wide x 17 rows */\
1990     uint8_t halfHV[256];\
1991     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1992     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1993     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1994 }\
1995 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant.\
      * Builds halfH (h-lowpass of full, 17 rows), halfV (v-lowpass of full) and halfHV (v-lowpass of halfH);\
      * dst = OPNAME pixels16_l2 of halfV and halfHV. halfH only feeds halfHV. */\
1996     uint8_t full[24*17];\
1997     uint8_t halfH[272];/* 16 wide x 17 rows */\
1998     uint8_t halfV[256];\
1999     uint8_t halfHV[256];\
2000     copy_block17(full, src, 24, stride, 17);\
2001     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2003     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2005 }\
2006 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){/* halfH = h-lowpass of full (17 rows), overwritten in place by\
      * pixels16_l2(halfH, full); the vertical lowpass of that result goes directly into dst with the OPNAME operation. */\
2007     uint8_t full[24*17];\
2008     uint8_t halfH[272];/* 16 wide x 17 rows */\
2009     copy_block17(full, src, 24, stride, 17);\
2010     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2011     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2012     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2013 }\
2014 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){/* Non-static variant.\
      * Like the 16-wide mc12 old variant, but halfV is the v-lowpass of full+1 (one sample to the right);\
      * dst = OPNAME pixels16_l2 of halfV and halfHV. */\
2015     uint8_t full[24*17];\
2016     uint8_t halfH[272];/* 16 wide x 17 rows */\
2017     uint8_t halfV[256];\
2018     uint8_t halfHV[256];\
2019     copy_block17(full, src, 24, stride, 17);\
2020     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2022     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2024 }\
2025 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){/* Like the 16-wide mc12 variant, but halfH is combined in place\
      * with full+1 (one sample to the right) before the vertical lowpass into dst. */\
2026     uint8_t full[24*17];\
2027     uint8_t halfH[272];/* 16 wide x 17 rows */\
2028     copy_block17(full, src, 24, stride, 17);\
2029     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2030     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2031     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2032 }\
2033 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){/* Horizontal lowpass of 17 rows from src into halfH,\
      * followed by the vertical lowpass of halfH directly into dst with the OPNAME operation.\
      * This is the last function of the macro body: its closing brace carries no continuation backslash. */\
2034     uint8_t halfH[272];/* 16 wide x 17 rows */\
2035     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2036     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2037 }
2038
/* Store operators handed to QPEL_MC as its last argument (used there as OP).
 * `a` is the destination lvalue and `b` the raw filter sum. The MPEG-4 lowpass
 * taps add up to 32, hence the >>5. `cm` is not a parameter: it must be in
 * scope at the expansion site (each lowpass function declares it from cropTbl). */
/* avg: bias by 16, shift, look up in cm, then average with the current a using +1 before the halving. */
2039 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* avg without rounding: bias 15 instead of 16 and no +1 before the halving. */
2040 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
/* put: bias by 16, shift, look up in cm, and overwrite a. */
2041 #define op_put(a, b) a = cm[((b) + 16)>>5]
/* put without rounding: same as op_put with a bias of 15. */
2042 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2043
/* Expand the QPEL_MC function family once per (name prefix, RND infix, operator) triple.
 * The avg_no_rnd expansion below is commented out, so op_avg_no_rnd is not used by
 * any of the active expansions. */
2044 QPEL_MC(0, put_       , _       , op_put)
2045 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2046 QPEL_MC(0, avg_       , _       , op_avg)
2047 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operator macros are only meant for the expansions above; drop them again. */
2048 #undef op_avg
2049 #undef op_avg_no_rnd
2050 #undef op_put
2051 #undef op_put_no_rnd
2052
2053 #if 1
2054 #define H264_LOWPASS(OPNAME, OP, OP2) \
2055 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Horizontal 6-tap lowpass, 4 outputs per row, h=4 rows.\
      * Taps are 1, -5, 20, 20, -5, 1 (sum 32). Each row reads src[-2..6], i.e. 2 samples to the left of the\
      * block and 3 to the right, and hands the raw sum to OP. */\
2056     const int h=4;\
2057     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; presumably consumed by the OP expansion - confirm at the H264_LOWPASS call sites */\
2058     int i;\
2059     for(i=0; i<h; i++)\
2060     {\
2061         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2062         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2063         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2064         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2065         dst+=dstStride;/* next row */\
2066         src+=srcStride;\
2067     }\
2068 }\
2069 \
2070 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 6-tap lowpass over w=4 columns, 4 output rows each.\
      * Taps are 1, -5, 20, 20, -5, 1 (sum 32). Each column reads rows -2..6 relative to src\
      * (srcB and srcA are rows -2 and -1) and hands the raw sum to OP. */\
2071     const int w=4;\
2072     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; presumably consumed by the OP expansion - confirm at the H264_LOWPASS call sites */\
2073     int i;\
2074     for(i=0; i<w; i++)\
2075     {\
2076         const int srcB= src[-2*srcStride];\
2077         const int srcA= src[-1*srcStride];\
2078         const int src0= src[0 *srcStride];\
2079         const int src1= src[1 *srcStride];\
2080         const int src2= src[2 *srcStride];\
2081         const int src3= src[3 *srcStride];\
2082         const int src4= src[4 *srcStride];\
2083         const int src5= src[5 *srcStride];\
2084         const int src6= src[6 *srcStride];\
2085         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2086         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2087         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2088         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2089         dst++;/* advance to the next column */\
2090         src++;\
2091     }\
2092 }\
2093 \
2094 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* Two-pass 6-tap lowpass for a 4x4 block.\
      * Pass 1 filters horizontally and stores the raw, unscaled sums in the int16_t buffer tmp for h+5 = 9 rows,\
      * starting 2 rows above src. Pass 2 filters those sums vertically per column and hands the result to OP2.\
      * tmp must hold at least 9 rows of tmpStride elements. */\
2095     const int h=4;\
2096     const int w=4;\
2097     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; presumably consumed by the OP2 expansion - confirm at the H264_LOWPASS call sites */\
2098     int i;\
2099     src -= 2*srcStride;/* the vertical pass needs 2 rows above the block */\
2100     for(i=0; i<h+5; i++)\
2101     {\
2102         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2103         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2104         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2105         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2106         tmp+=tmpStride;\
2107         src+=srcStride;\
2108     }\
2109     tmp -= tmpStride*(h+5-2);/* rewind so tmp[0] is the third stored row, leaving rows -2 and -1 addressable */\
2110     for(i=0; i<w; i++)\
2111     {\
2112         const int tmpB= tmp[-2*tmpStride];\
2113         const int tmpA= tmp[-1*tmpStride];\
2114         const int tmp0= tmp[0 *tmpStride];\
2115         const int tmp1= tmp[1 *tmpStride];\
2116         const int tmp2= tmp[2 *tmpStride];\
2117         const int tmp3= tmp[3 *tmpStride];\
2118         const int tmp4= tmp[4 *tmpStride];\
2119         const int tmp5= tmp[5 *tmpStride];\
2120         const int tmp6= tmp[6 *tmpStride];\
2121         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2122         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2123         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2124         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2125         dst++;/* advance to the next column */\
2126         tmp++;\
2127     }\
2128 }\
2129 \
2130 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Horizontal 6-tap lowpass, 8 outputs per row, h=8 rows.\
      * Taps are 1, -5, 20, 20, -5, 1 (sum 32). Each row reads src[-2..10], i.e. 2 samples to the left of the\
      * block and 3 to the right, and hands the raw sum to OP. */\
2131     const int h=8;\
2132     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; presumably consumed by the OP expansion - confirm at the H264_LOWPASS call sites */\
2133     int i;\
2134     for(i=0; i<h; i++)\
2135     {\
2136         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2137         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2138         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2139         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2140         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2141         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2142         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2143         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2144         dst+=dstStride;/* next row */\
2145         src+=srcStride;\
2146     }\
2147 }\
2148 \
2149 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* Vertical 6-tap lowpass over w=8 columns, 8 output rows each.\
      * Taps are 1, -5, 20, 20, -5, 1 (sum 32). Each column reads rows -2..10 relative to src\
      * (srcB and srcA are rows -2 and -1) and hands the raw sum to OP. */\
2150     const int w=8;\
2151     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; presumably consumed by the OP expansion - confirm at the H264_LOWPASS call sites */\
2152     int i;\
2153     for(i=0; i<w; i++)\
2154     {\
2155         const int srcB= src[-2*srcStride];\
2156         const int srcA= src[-1*srcStride];\
2157         const int src0= src[0 *srcStride];\
2158         const int src1= src[1 *srcStride];\
2159         const int src2= src[2 *srcStride];\
2160         const int src3= src[3 *srcStride];\
2161         const int src4= src[4 *srcStride];\
2162         const int src5= src[5 *srcStride];\
2163         const int src6= src[6 *srcStride];\
2164         const int src7= src[7 *srcStride];\
2165         const int src8= src[8 *srcStride];\
2166         const int src9= src[9 *srcStride];\
2167         const int src10=src[10*srcStride];\
2168         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2169         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2170         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2171         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2172         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2173         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2174         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2175         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2176         dst++;/* advance to the next column */\
2177         src++;\
2178     }\
2179 }\
2180 \
2181 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* Two-pass 6-tap lowpass for an 8x8 block.\
      * Pass 1 filters horizontally and stores the raw, unscaled sums in the int16_t buffer tmp for h+5 = 13 rows,\
      * starting 2 rows above src. Pass 2 filters those sums vertically per column and hands the result to OP2.\
      * tmp must hold at least 13 rows of tmpStride elements. */\
2182     const int h=8;\
2183     const int w=8;\
2184     uint8_t *cm = cropTbl + MAX_NEG_CROP;/* not named below; presumably consumed by the OP2 expansion - confirm at the H264_LOWPASS call sites */\
2185     int i;\
2186     src -= 2*srcStride;/* the vertical pass needs 2 rows above the block */\
2187     for(i=0; i<h+5; i++)\
2188     {\
2189         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2190         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2191         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2192         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2193         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2194         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2195         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2196         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2197         tmp+=tmpStride;\
2198         src+=srcStride;\
2199     }\
2200     tmp -= tmpStride*(h+5-2);/* rewind so tmp[0] is the third stored row, leaving rows -2 and -1 addressable */\
2201     for(i=0; i<w; i++)\
2202     {\
2203         const int tmpB= tmp[-2*tmpStride];\
2204         const int tmpA= tmp[-1*tmpStride];\
2205         const int tmp0= tmp[0 *tmpStride];\
2206         const int tmp1= tmp[1 *tmpStride];\
2207         const int tmp2= tmp[2 *tmpStride];\
2208         const int tmp3= tmp[3 *tmpStride];\
2209         const int tmp4= tmp[4 *tmpStride];\
2210         const int tmp5= tmp[5 *tmpStride];\
2211         const int tmp6= tmp[6 *tmpStride];\
2212         const int tmp7= tmp[7 *tmpStride];\
2213         const int tmp8= tmp[8 *tmpStride];\
2214         const int tmp9= tmp[9 *tmpStride];\
2215         const int tmp10=tmp[10*tmpStride];\
2216         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2217         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2218         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2219         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2220         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2221         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2222         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2223         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2224         dst++;/* advance to the next column */\
2225         tmp++;\
2226     }\
2227 }\
2228 \
2229 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2231     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2232     src += 8*srcStride;\
2233     dst += 8*dstStride;\
2234     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2235     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2236 }\
2237 \
2238 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2239     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2240     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2241     src += 8*srcStride;\
2242     dst += 8*dstStride;\
2243     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2244     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2245 }\
2246 \
2247 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2248     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2249     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2250     src += 8*srcStride;\
2251     dst += 8*dstStride;\
2252     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2253     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2254 }\
2255
/**
 * Generates the sixteen functions OPNAME##h264_qpel##SIZE##_mcXY_c for one
 * store operation (OPNAME) and one square block size (SIZE).
 *
 * As visible from the bodies below:
 *  - mc00 forwards to OPNAME##pixels##SIZE##_c;
 *  - mc20 / mc02 / mc22 write the horizontal / vertical / two-dimensional
 *    lowpass result straight to dst using the OPNAME lowpass functions;
 *  - every other position first builds one or two intermediate SIZExSIZE
 *    planes with the put_ lowpass functions (or takes the source itself) and
 *    hands them to OPNAME##pixels##SIZE##_l2, which combines two planes into
 *    dst.
 *
 * The vertical filter is always run on a tightly packed copy of the source
 * (row pitch SIZE) that includes two extra rows above and three below the
 * block, i.e. SIZE+5 rows in total.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    /* horizontal lowpass plane combined with the unshifted source */\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    /* horizontal lowpass plane combined with the source one column to the right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    uint8_t hpel[SIZE*SIZE];\
    copy_block ## SIZE (packed, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel, packed_mid, SIZE, SIZE);\
    /* vertical lowpass plane combined with the packed source rows */\
    OPNAME ## pixels ## SIZE ## _l2(dst, packed_mid, hpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    copy_block ## SIZE (packed, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, packed_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    uint8_t hpel[SIZE*SIZE];\
    copy_block ## SIZE (packed, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel, packed_mid, SIZE, SIZE);\
    /* vertical lowpass plane combined with the packed source one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, packed_mid+SIZE, hpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    uint8_t plane_v[SIZE*SIZE];\
    uint8_t plane_h[SIZE*SIZE];\
    copy_block ## SIZE (packed, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(plane_v, packed_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(plane_h, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_h, plane_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    uint8_t plane_v[SIZE*SIZE];\
    uint8_t plane_h[SIZE*SIZE];\
    /* vertical filter runs on the column block one pixel to the right */\
    copy_block ## SIZE (packed, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(plane_v, packed_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(plane_h, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_h, plane_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    uint8_t plane_v[SIZE*SIZE];\
    uint8_t plane_h[SIZE*SIZE];\
    copy_block ## SIZE (packed, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(plane_v, packed_mid, SIZE, SIZE);\
    /* horizontal filter runs on the rows one line further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(plane_h, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_h, plane_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    uint8_t plane_v[SIZE*SIZE];\
    uint8_t plane_h[SIZE*SIZE];\
    copy_block ## SIZE (packed, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(plane_v, packed_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(plane_h, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_h, plane_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t plane_hv[SIZE*SIZE];\
    uint8_t plane_h[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(plane_hv, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(plane_h, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_h, plane_hv, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t plane_hv[SIZE*SIZE];\
    uint8_t plane_h[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(plane_hv, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(plane_h, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_h, plane_hv, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t plane_hv[SIZE*SIZE];\
    uint8_t plane_v[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(plane_hv, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (packed, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(plane_v, packed_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_v, plane_hv, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t packed[SIZE*(SIZE+5)];\
    uint8_t * const packed_mid = packed + 2*SIZE;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t plane_hv[SIZE*SIZE];\
    uint8_t plane_v[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(plane_hv, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (packed, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(plane_v, packed_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, plane_v, plane_hv, stride, SIZE, SIZE, SIZE);\
}
2392
2393 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2394 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2395 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2396 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2397 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2398
2399 H264_LOWPASS(put_       , op_put, op2_put)
2400 H264_LOWPASS(avg_       , op_avg, op2_avg)
2401 H264_MC(put_, 4)
2402 H264_MC(put_, 8)
2403 H264_MC(put_, 16)
2404 H264_MC(avg_, 4)
2405 H264_MC(avg_, 8)
2406 H264_MC(avg_, 16)
2407
2408 #undef op_avg
2409 #undef op_put
2410 #undef op2_avg
2411 #undef op2_put
2412 #endif
2413
/* One weighted sample, in place: (block[x]*weight + offset) >> log2_denom, clipped to 8 bits. */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
/* One bi-weighted sample: src and dst contributions summed, shifted by log2_denom+1, clipped. */
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generates weight_h264_pixels<W>x<H>_c and biweight_h264_pixels<W>x<H>_c
 * for a W-wide, H-high block.
 *
 * weight:   the offset is pre-scaled by << log2_denom and, when log2_denom
 *           is non-zero, a rounding term 1 << (log2_denom-1) is added before
 *           the per-sample op_scale1.
 * biweight: the two offsets are merged as (offsets + offsetd + 1) >> 1, then
 *           turned into ((offset << 1) + 1) << log2_denom before the
 *           per-sample op_scale2.
 *
 * Each sample depends only on the sample(s) at the same position, so the
 * columns are processed with a plain loop over 0..W-1.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int row, col; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale1(col); \
        } \
        block += stride; \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int row, col; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale2(col); \
        } \
        dst += stride; \
        src += stride; \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2484
2485 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2486     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2487     int i;
2488
2489     for(i=0; i<h; i++){
2490         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2491         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2492         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2493         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2494         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2495         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2496         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2497         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2498         dst+=dstStride;
2499         src+=srcStride;
2500     }
2501 }
2502
2503 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2504     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2505     int i;
2506
2507     for(i=0; i<w; i++){
2508         const int src_1= src[ -srcStride];
2509         const int src0 = src[0          ];
2510         const int src1 = src[  srcStride];
2511         const int src2 = src[2*srcStride];
2512         const int src3 = src[3*srcStride];
2513         const int src4 = src[4*srcStride];
2514         const int src5 = src[5*srcStride];
2515         const int src6 = src[6*srcStride];
2516         const int src7 = src[7*srcStride];
2517         const int src8 = src[8*srcStride];
2518         const int src9 = src[9*srcStride];
2519         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2520         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2521         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2522         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2523         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2524         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2525         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2526         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2527         src++;
2528         dst++;
2529     }
2530 }
2531
/** Position (0,0): forwards to put_pixels8_c() with a height of 8. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2535
/**
 * Position (1,0): builds the horizontally filtered 8x8 plane and passes it,
 * together with the unshifted source, to put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2541
/** Position (2,0): the horizontally filtered block is written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2545
/**
 * Position (3,0): builds the horizontally filtered 8x8 plane and passes it,
 * together with the source shifted one column to the right, to
 * put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2551
/** Position (0,2): the vertically filtered block is written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2555
/**
 * Position (1,2): combines the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2().
 * The horizontal pass covers 11 rows starting one row above the block so
 * that the following vertical pass has its -1 .. 9 row context.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8*11];
    uint8_t plane_hv[8*8];
    uint8_t plane_v[8*8];

    wmv2_mspel8_v_lowpass(plane_v, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(plane_hv, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, plane_v, plane_hv, stride, 8, 8, 8);
}
/**
 * Position (3,2): like the (1,2) case, except that the purely vertical
 * plane is computed from the source shifted one column to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8*11];
    uint8_t plane_hv[8*8];
    uint8_t plane_v[8*8];

    wmv2_mspel8_v_lowpass(plane_v, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(plane_hv, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, plane_v, plane_hv, stride, 8, 8, 8);
}
/**
 * Position (2,2): horizontal filter into an 11-row scratch buffer (starting
 * one row above the block), then vertical filter from its second row into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8*11];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);
}
2579
2580 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2581     int x;
2582     const int strength= ff_h263_loop_filter_strength[qscale];
2583
2584     for(x=0; x<8; x++){
2585         int d1, d2, ad1;
2586         int p0= src[x-2*stride];
2587         int p1= src[x-1*stride];
2588         int p2= src[x+0*stride];
2589         int p3= src[x+1*stride];
2590         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2591
2592         if     (d<-2*strength) d1= 0;
2593         else if(d<-  strength) d1=-2*strength - d;
2594         else if(d<   strength) d1= d;
2595         else if(d< 2*strength) d1= 2*strength - d;
2596         else                   d1= 0;
2597
2598         p1 += d1;
2599         p2 -= d1;
2600         if(p1&256) p1= ~(p1>>31);
2601         if(p2&256) p2= ~(p2>>31);
2602
2603         src[x-1*stride] = p1;
2604         src[x+0*stride] = p2;
2605
2606         ad1= ABS(d1)>>1;
2607
2608         d2= clip((p0-p3)/4, -ad1, ad1);
2609
2610         src[x-2*stride] = p0 - d2;
2611         src[x+  stride] = p3 + d2;
2612     }
2613 }
2614
2615 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2616     int y;
2617     const int strength= ff_h263_loop_filter_strength[qscale];
2618
2619     for(y=0; y<8; y++){
2620         int d1, d2, ad1;
2621         int p0= src[y*stride-2];
2622         int p1= src[y*stride-1];
2623         int p2= src[y*stride+0];
2624         int p3= src[y*stride+1];
2625         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2626
2627         if     (d<-2*strength) d1= 0;
2628         else if(d<-  strength) d1=-2*strength - d;
2629         else if(d<   strength) d1= d;
2630         else if(d< 2*strength) d1= 2*strength - d;
2631         else                   d1= 0;
2632
2633         p1 += d1;
2634         p2 -= d1;
2635         if(p1&256) p1= ~(p1>>31);
2636         if(p2&256) p2= ~(p2>>31);
2637
2638         src[y*stride-1] = p1;
2639         src[y*stride+0] = p2;
2640
2641         ad1= ABS(d1)>>1;
2642
2643         d2= clip((p0-p3)/4, -ad1, ad1);
2644
2645         src[y*stride-2] = p0 - d2;
2646         src[y*stride+1] = p3 + d2;
2647     }
2648 }
2649
/**
 * Separable (1,2,1) smoothing of an 8x8 block, in place.
 *
 * Vertical pass (into a scratch buffer): rows 1..6 get
 * above + 2*centre + below; rows 0 and 7 are only scaled by 4 so that every
 * entry carries the same gain.
 * Horizontal pass (back into src): columns 1..6 get
 * (left + 2*centre + right + 8) >> 4; columns 0 and 7 are only normalised
 * with (value + 2) >> 2.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* vertical pass; the whole block is read before anything is written back */
    for(col = 0; col < 8; col++){
        vert[col]        = 4*src[col];
        vert[7*8 + col]  = 4*src[col + 7*stride];
    }
    for(row = 1; row < 7; row++){
        const uint8_t *in  = src  + row*stride;
        int           *out = vert + row*8;

        for(col = 0; col < 8; col++)
            out[col] = in[col - stride] + 2*in[col] + in[col + stride];
    }

    /* horizontal pass */
    for(row = 0; row < 8; row++){
        const int *in  = vert + row*8;
        uint8_t   *out = src  + row*stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for(col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
    }
}
2676
/**
 * H.264 luma deblocking of one 16-line edge for the non-intra strengths.
 * The edge is handled as four segments of four lines; tc0[i] gives the
 * clipping bound of segment i and a negative value skips that segment.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples sit at negative multiples)
 * @param ystride step along the edge, from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     four per-segment clipping bounds
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int tc_base = tc0[seg];

        if( tc_base < 0 ) {
            pix += 4*ystride;
            continue;
        }

        for( line = 0; line < 4; line++, pix += ystride ) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            int tc = tc_base;
            int edge_avg, delta;

            /* leave the line alone unless all three activity checks pass */
            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            edge_avg = ( p0 + q0 + 1 ) >> 1;

            /* each side that is smooth enough gets its second sample adjusted
               and widens the bound used for p0/q0 by one */
            if( ABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + clip( (( p2 + edge_avg ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( ABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + clip( (( q2 + edge_avg ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Luma deblocking with the filter taps spaced by `stride` (across rows) and
 * successive filtered positions one byte apart.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Luma deblocking with the filter taps one byte apart (across columns) and
 * successive filtered positions spaced by `stride`.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2725
/**
 * H.264 chroma deblocking of one 8-line edge for the non-intra strengths.
 * The edge is handled as four segments of two lines; a segment whose tc0
 * entry is zero or negative is skipped.  Only p0 and q0 are modified.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge, from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     four per-segment clipping bounds
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int bound = tc0[seg];

        if( bound <= 0 ) {
            pix += 2*ystride;
            continue;
        }

        for( line = 0; line < 2; line++, pix += ystride ) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            /* leave the line alone unless all three activity checks pass */
            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -bound, bound );

            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Chroma deblocking with the filter taps spaced by `stride` (across rows)
 * and successive filtered positions one byte apart.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Chroma deblocking with the filter taps one byte apart (across columns)
 * and successive filtered positions spaced by `stride`.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2762
/**
 * H.264 chroma deblocking of one 8-line edge without a tc0 table: when the
 * three activity checks pass, p0 and q0 are replaced by fixed-weight
 * averages (2*p1 + p0 + q1 + 2) >> 2 and (2*q1 + q0 + p1 + 2) >> 2.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge, from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) >= alpha ||
            ABS( p1 - p0 ) >= beta ||
            ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma deblocking with the filter taps spaced by `stride` (across
 * rows) and successive filtered positions one byte apart.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/**
 * Intra chroma deblocking with the filter taps one byte apart (across
 * columns) and successive filtered positions spaced by `stride`.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
2790
/**
 * Sum of absolute differences between two 16-sample-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return sum over all h*16 positions of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2818
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference sample with its right-hand neighbour.
 * Reads 17 samples per row of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2846
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference sample with the sample directly below it.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2876
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of
 * each 2x2 neighbourhood (sample, right, below, below-right) of the
 * reference.  Reads 17 samples per row over h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2906
/**
 * Sum of absolute differences between two 8-sample-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return sum over all h*8 positions of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2926
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of
 * each reference sample with its right-hand neighbour.
 * Reads 9 samples per row of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2946
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of
 * each reference sample with the sample directly below it.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2968
/**
 * Sum of absolute differences between an 8-wide block and the avg4() of
 * each 2x2 neighbourhood (sample, right, below, below-right) of the
 * reference.  Reads 9 samples per row over h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++){
        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2990
2991 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2992     MpegEncContext *c = v;
2993     int score1=0;
2994     int score2=0;
2995     int x,y;
2996
2997     for(y=0; y<h; y++){
2998         for(x=0; x<16; x++){
2999             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3000         }
3001         if(y+1<h){
3002             for(x=0; x<15; x++){
3003                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3004                              - s1[x+1] + s1[x+1+stride])
3005                         -ABS(  s2[x  ] - s2[x  +stride]
3006                              - s2[x+1] + s2[x+1+stride]);
3007             }
3008         }
3009         s1+= stride;
3010         s2+= stride;
3011     }
3012
3013     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3014     else  return score1 + ABS(score2)*8;
3015 }
3016
3017 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3018     MpegEncContext *c = v;
3019     int score1=0;
3020     int score2=0;
3021     int x,y;
3022
3023     for(y=0; y<h; y++){
3024         for(x=0; x<8; x++){
3025             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3026         }
3027         if(y+1<h){
3028             for(x=0; x<7; x++){
3029                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3030                              - s1[x+1] + s1[x+1+stride])
3031                         -ABS(  s2[x  ] - s2[x  +stride]
3032                              - s2[x+1] + s2[x+1+stride]);
3033             }
3034         }
3035         s1+= stride;
3036         s2+= stride;
3037     }
3038
3039     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3040     else  return score1 + ABS(score2)*8;
3041 }
3042
3043 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3044     int i;
3045     unsigned int sum=0;
3046
3047     for(i=0; i<8*8; i++){
3048         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3049         int w= weight[i];
3050         b>>= RECON_SHIFT;
3051         assert(-512<b && b<512);
3052
3053         sum += (w*b)*(w*b)>>4;
3054     }
3055     return sum>>2;
3056 }
3057
3058 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3059     int i;
3060
3061     for(i=0; i<8*8; i++){
3062         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3063     }
3064 }
3065
3066 /**
3067  * permutes an 8x8 block.
3068  * @param block the block which will be permuted according to the given permutation vector
3069  * @param permutation the permutation vector
3070  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3071  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3072  *                  (inverse) permutated to scantable order!
3073  */
3074 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3075 {
3076     int i;
3077     DCTELEM temp[64];
3078
3079     if(last<=0) return;
3080     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3081
3082     for(i=0; i<=last; i++){
3083         const int j= scantable[i];
3084         temp[j]= block[j];
3085         block[j]=0;
3086     }
3087
3088     for(i=0; i<=last; i++){
3089         const int j= scantable[i];
3090         const int perm_j= permutation[j];
3091         block[perm_j]= temp[j];
3092     }
3093 }
3094
/**
 * Comparison function that ignores its input and always scores 0.
 * Installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3098
3099 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3100     int i;
3101
3102     memset(cmp, 0, sizeof(void*)*5);
3103
3104     for(i=0; i<5; i++){
3105         switch(type&0xFF){
3106         case FF_CMP_SAD:
3107             cmp[i]= c->sad[i];
3108             break;
3109         case FF_CMP_SATD:
3110             cmp[i]= c->hadamard8_diff[i];
3111             break;
3112         case FF_CMP_SSE:
3113             cmp[i]= c->sse[i];
3114             break;
3115         case FF_CMP_DCT:
3116             cmp[i]= c->dct_sad[i];
3117             break;
3118         case FF_CMP_DCTMAX:
3119             cmp[i]= c->dct_max[i];
3120             break;
3121         case FF_CMP_PSNR:
3122             cmp[i]= c->quant_psnr[i];
3123             break;
3124         case FF_CMP_BIT:
3125             cmp[i]= c->bit[i];
3126             break;
3127         case FF_CMP_RD:
3128             cmp[i]= c->rd[i];
3129             break;
3130         case FF_CMP_VSAD:
3131             cmp[i]= c->vsad[i];
3132             break;
3133         case FF_CMP_VSSE:
3134             cmp[i]= c->vsse[i];
3135             break;
3136         case FF_CMP_ZERO:
3137             cmp[i]= zero_cmp;
3138             break;
3139         case FF_CMP_NSSE:
3140             cmp[i]= c->nsse[i];
3141             break;
3142         case FF_CMP_W53:
3143             cmp[i]= c->w53[i];
3144             break;
3145         case FF_CMP_W97:
3146             cmp[i]= c->w97[i];
3147             break;
3148         default:
3149             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3150         }
3151     }
3152 }
3153
3154 /**
3155  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3156  */
3157 static void clear_blocks_c(DCTELEM *blocks)
3158 {
3159     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3160 }
3161
/**
 * Add src to dst byte by byte (modulo 256), in place.
 *
 * @param dst destination bytes, updated
 * @param src bytes to add
 * @param w   number of bytes; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos = 0;

    /* bulk part: groups of 8 bytes */
    while(w - pos >= 8){
        int k;

        for(k=0; k<8; k++)
            dst[pos+k] += src[pos+k];
        pos += 8;
    }

    /* remaining 0..7 bytes */
    while(pos < w){
        dst[pos] += src[pos];
        pos++;
    }
}
3177
/**
 * Store the byte-wise difference src1 - src2 (modulo 256) into dst.
 *
 * @param dst  destination bytes
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos = 0;

    /* bulk part: groups of 8 bytes */
    while(w - pos >= 8){
        int k;

        for(k=0; k<8; k++)
            dst[pos+k] = src1[pos+k]-src2[pos+k];
        pos += 8;
    }

    /* remaining 0..7 bytes */
    while(pos < w){
        dst[pos] = src1[pos]-src2[pos];
        pos++;
    }
}
3193
/**
 * Subtract a median prediction from a row of bytes.
 *
 * For each position the predictor is mid_pred() of the running "left" value,
 * src1[i], and (left + src1[i] - left_top) & 0xFF. After each position, left
 * becomes src2[i] and left_top becomes src1[i].
 *
 * @param dst      receives src2[i] - prediction (truncated to 8 bits)
 * @param src1     bytes feeding the predictor and the left_top state
 * @param src2     bytes being predicted
 * @param w        number of bytes to process
 * @param left     in: initial left value; out: last left value
 * @param left_top in: initial left_top value; out: last left_top value
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left    = *left;
    uint8_t cur_topleft = *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top  = src1[x];
        const int pred = mid_pred(cur_left, top, (cur_left + top - cur_topleft)&0xFF);

        cur_topleft = top;
        cur_left    = src2[x];
        dst[x]      = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_topleft;
}
3211
/* Two-output butterfly: o1 receives i1+i2 and o2 receives i1-i2.
 * Expands to two separate statements and evaluates both inputs twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes the old x minus y.
 * Both operands are read once into temporaries before being overwritten. */
#define BUTTERFLY1(x,y) \
{\
    int in0= x;\
    int in1= y;\
    x= in0+in1;\
    y= in0-in1;\
}

/* Sum of the magnitudes of both butterfly outputs, without storing them. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3226
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the difference src - dst.
 *
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance between consecutive rows, in bytes
 * @param h      must be 8
 * @return sum of the absolute transformed coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of differences */
    for(n=0; n<8; n++){
        int *row= coef + 8*n;
        const uint8_t *cur= src + stride*n;
        const uint8_t *ref= dst + stride*n;

        BUTTERFLY2(row[0], row[1], cur[0]-ref[0], cur[1]-ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-ref[2], cur[3]-ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-ref[4], cur[5]-ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-ref[6], cur[7]-ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(n=0; n<8; n++){
        int *col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
3278
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * a single block, with the mean (DC) term removed.
 *
 * @param s      unused
 * @param src    8x8 block to transform
 * @param dummy  unused
 * @param stride distance between consecutive rows, in bytes
 * @param h      must be 8
 * @return sum of the absolute transformed coefficients minus the DC magnitude
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of pixels */
    for(n=0; n<8; n++){
        int *row= coef + 8*n;
        const uint8_t *pix= src + stride*n;

        BUTTERFLY2(row[0], row[1], pix[0], pix[1]);
        BUTTERFLY2(row[2], row[3], pix[2], pix[3]);
        BUTTERFLY2(row[4], row[5], pix[4], pix[5]);
        BUTTERFLY2(row[6], row[7], pix[6], pix[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(n=0; n<8; n++){
        int *col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3326
3327 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3328     MpegEncContext * const s= (MpegEncContext *)c;
3329     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3330     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3331     int sum=0, i;
3332
3333     assert(h==8);
3334
3335     s->dsp.diff_pixels(temp, src1, src2, stride);
3336     s->dsp.fdct(temp);
3337
3338     for(i=0; i<64; i++)
3339         sum+= ABS(temp[i]);
3340
3341     return sum;
3342 }
3343
3344 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3345     MpegEncContext * const s= (MpegEncContext *)c;
3346     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3347     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3348     int sum=0, i;
3349
3350     assert(h==8);
3351
3352     s->dsp.diff_pixels(temp, src1, src2, stride);
3353     s->dsp.fdct(temp);
3354
3355     for(i=0; i<64; i++)
3356         sum= FFMAX(sum, ABS(temp[i]));
3357
3358     return sum;
3359 }
3360
3361 void simple_idct(DCTELEM *block); //FIXME
3362
3363 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3364     MpegEncContext * const s= (MpegEncContext *)c;
3365     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3366     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3367     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3368     int sum=0, i;
3369
3370     assert(h==8);
3371     s->mb_intra=0;
3372
3373     s->dsp.diff_pixels(temp, src1, src2, stride);
3374
3375     memcpy(bak, temp, 64*sizeof(DCTELEM));
3376
3377     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3378     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3379     simple_idct(temp); //FIXME
3380
3381     for(i=0; i<64; i++)
3382         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3383
3384     return sum;
3385 }
3386
3387 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3388     MpegEncContext * const s= (MpegEncContext *)c;
3389     const uint8_t *scantable= s->intra_scantable.permutated;
3390     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3391     uint64_t __align8 aligned_bak[stride];
3392     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3393     uint8_t * const bak= (uint8_t*)aligned_bak;
3394     int i, last, run, bits, level, distoration, start_i;
3395     const int esc_length= s->ac_esc_length;
3396     uint8_t * length;
3397     uint8_t * last_length;
3398
3399     assert(h==8);
3400
3401     for(i=0; i<8; i++){
3402         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3403         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3404     }
3405
3406     s->dsp.diff_pixels(temp, src1, src2, stride);
3407
3408     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3409
3410     bits=0;
3411
3412     if (s->mb_intra) {
3413         start_i = 1;
3414         length     = s->intra_ac_vlc_length;
3415         last_length= s->intra_ac_vlc_last_length;
3416         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3417     } else {
3418         start_i = 0;
3419         length     = s->inter_ac_vlc_length;
3420         last_length= s->inter_ac_vlc_last_length;
3421     }
3422
3423     if(last>=start_i){
3424         run=0;
3425         for(i=start_i; i<last; i++){
3426             int j= scantable[i];
3427             level= temp[j];
3428
3429             if(level){
3430                 level+=64;
3431                 if((level&(~127)) == 0){
3432                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3433                 }else
3434                     bits+= esc_length;
3435                 run=0;
3436             }else
3437                 run++;
3438         }
3439         i= scantable[last];
3440
3441         level= temp[i] + 64;
3442
3443         assert(level - 64);
3444
3445         if((level&(~127)) == 0){
3446             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3447         }else
3448             bits+= esc_length;
3449
3450     }
3451
3452     if(last>=0){
3453         if(s->mb_intra)
3454             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3455         else
3456             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3457     }
3458
3459     s->dsp.idct_add(bak, stride, temp);
3460
3461     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3462
3463     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3464 }
3465
3466 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3467     MpegEncContext * const s= (MpegEncContext *)c;
3468     const uint8_t *scantable= s->intra_scantable.permutated;
3469     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3470     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3471     int i, last, run, bits, level, start_i;
3472     const int esc_length= s->ac_esc_length;
3473     uint8_t * length;
3474     uint8_t * last_length;
3475
3476     assert(h==8);
3477
3478     s->dsp.diff_pixels(temp, src1, src2, stride);
3479
3480     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3481
3482     bits=0;
3483
3484     if (s->mb_intra) {
3485         start_i = 1;
3486         length     = s->intra_ac_vlc_length;
3487         last_length= s->intra_ac_vlc_last_length;
3488         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3489     } else {
3490         start_i = 0;
3491         length     = s->inter_ac_vlc_length;
3492         last_length= s->inter_ac_vlc_last_length;
3493     }
3494
3495     if(last>=start_i){
3496         run=0;
3497         for(i=start_i; i<last; i++){
3498             int j= scantable[i];
3499             level= temp[j];
3500
3501             if(level){
3502                 level+=64;
3503                 if((level&(~127)) == 0){
3504                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3505                 }else
3506                     bits+= esc_length;
3507                 run=0;
3508             }else
3509                 run++;
3510         }
3511         i= scantable[last];
3512
3513         level= temp[i] + 64;
3514
3515         assert(level - 64);
3516
3517         if((level&(~127)) == 0){
3518             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3519         }else
3520             bits+= esc_length;
3521     }
3522
3523     return bits;
3524 }
3525
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance between consecutive rows, in bytes
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++, s+=stride){
        const uint8_t *next= s + stride;

        for(col=0; col<16; col++)
            total+= ABS(s[col] - next[col]);
    }

    return total;
}
3540
/**
 * Sum of absolute vertical differences of the error s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between consecutive rows, in bytes
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute difference
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++, s1+=stride, s2+=stride){
        for(col=0; col<16; col++){
            const int d= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];

            total+= ABS(d);
        }
    }

    return total;
}
3555
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance between consecutive rows, in bytes
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++, s+=stride){
        for(col=0; col<16; col++){
            const int d= s[col] - s[col+stride];

            total+= SQ(d);
        }
    }

    return total;
}
3571
/**
 * Sum of squared vertical differences of the error s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between consecutive rows, in bytes
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++, s1+=stride, s2+=stride){
        for(col=0; col<16; col++){
            const int d= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];

            total+= SQ(d);
        }
    }

    return total;
}
3586
/*
 * Instantiate, via WARPER8_16_SQ, the functions named by the second argument
 * from the 8x8 comparison functions named by the first argument. The
 * generated *16_c names are the ones dsputil_init() later stores in the
 * DSPContext comparison tables.
 * NOTE(review): WARPER8_16_SQ is defined outside this part of the file;
 * presumably each generated function applies the 8x8 routine to the 8x8
 * sub-blocks of a 16-pixel-wide block and sums the scores -- confirm against
 * the macro definition.
 */
3587 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3588 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3589 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3590 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3591 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3592 WARPER8_16_SQ(rd8x8_c, rd16_c)
3593 WARPER8_16_SQ(bit8x8_c, bit16_c)
3594
3595 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3596  converted */
3597 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3598 {
3599     j_rev_dct (block);
3600     put_pixels_clamped_c(block, dest, line_size);
3601 }
3602 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3603 {
3604     j_rev_dct (block);
3605     add_pixels_clamped_c(block, dest, line_size);
3606 }
3607
3608 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3609 {
3610     j_rev_dct4 (block);
3611     put_pixels_clamped4_c(block, dest, line_size);
3612 }
3613 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3614 {
3615     j_rev_dct4 (block);
3616     add_pixels_clamped4_c(block, dest, line_size);
3617 }
3618
3619 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3620 {
3621     j_rev_dct2 (block);
3622     put_pixels_clamped2_c(block, dest, line_size);
3623 }
3624 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3625 {
3626     j_rev_dct2 (block);
3627     add_pixels_clamped2_c(block, dest, line_size);
3628 }
3629
3630 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3631 {
3632     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3633
3634     dest[0] = cm[(block[0] + 4)>>3];
3635 }
3636 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3637 {
3638     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3639
3640     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3641 }
3642
3643 /* init static data */
3644 void dsputil_static_init(void)
3645 {
3646     int i;
3647
3648     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3649     for(i=0;i<MAX_NEG_CROP;i++) {
3650         cropTbl[i] = 0;
3651         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3652     }
3653
3654     for(i=0;i<512;i++) {
3655         squareTbl[i] = (i - 256) * (i - 256);
3656     }
3657
3658     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3659 }
3660
3661
3662 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3663 {
3664     int i;
3665
3666 #ifdef CONFIG_ENCODERS
3667     if(avctx->dct_algo==FF_DCT_FASTINT) {
3668         c->fdct = fdct_ifast;
3669         c->fdct248 = fdct_ifast248;
3670     }
3671     else if(avctx->dct_algo==FF_DCT_FAAN) {
3672         c->fdct = ff_faandct;
3673         c->fdct248 = ff_faandct248;
3674     }
3675     else {
3676         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3677         c->fdct248 = ff_fdct248_islow;
3678     }
3679 #endif //CONFIG_ENCODERS
3680
3681     if(avctx->lowres==1){
3682         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3683             c->idct_put= ff_jref_idct4_put;
3684             c->idct_add= ff_jref_idct4_add;
3685         }else{
3686             c->idct_put= ff_h264_lowres_idct_put_c;
3687             c->idct_add= ff_h264_lowres_idct_add_c;
3688         }
3689         c->idct    = j_rev_dct4;
3690         c->idct_permutation_type= FF_NO_IDCT_PERM;
3691     }else if(avctx->lowres==2){
3692         c->idct_put= ff_jref_idct2_put;
3693         c->idct_add= ff_jref_idct2_add;
3694         c->idct    = j_rev_dct2;
3695         c->idct_permutation_type= FF_NO_IDCT_PERM;
3696     }else if(avctx->lowres==3){
3697         c->idct_put= ff_jref_idct1_put;
3698         c->idct_add= ff_jref_idct1_add;
3699         c->idct    = j_rev_dct1;
3700         c->idct_permutation_type= FF_NO_IDCT_PERM;
3701     }else{
3702         if(avctx->idct_algo==FF_IDCT_INT){
3703             c->idct_put= ff_jref_idct_put;
3704             c->idct_add= ff_jref_idct_add;
3705             c->idct    = j_rev_dct;
3706             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3707         }else if(avctx->idct_algo==FF_IDCT_VP3){
3708             c->idct_put= ff_vp3_idct_put_c;
3709             c->idct_add= ff_vp3_idct_add_c;
3710             c->idct    = ff_vp3_idct_c;
3711             c->idct_permutation_type= FF_NO_IDCT_PERM;
3712         }else{ //accurate/default
3713             c->idct_put= simple_idct_put;
3714             c->idct_add= simple_idct_add;
3715             c->idct    = simple_idct;
3716             c->idct_permutation_type= FF_NO_IDCT_PERM;
3717         }
3718     }
3719
3720     c->h264_idct_add= ff_h264_idct_add_c;
3721     c->h264_idct8_add= ff_h264_idct8_add_c;
3722
3723     c->get_pixels = get_pixels_c;
3724     c->diff_pixels = diff_pixels_c;
3725     c->put_pixels_clamped = put_pixels_clamped_c;
3726     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3727     c->add_pixels_clamped = add_pixels_clamped_c;
3728     c->add_pixels8 = add_pixels8_c;
3729     c->add_pixels4 = add_pixels4_c;
3730     c->gmc1 = gmc1_c;
3731     c->gmc = gmc_c;
3732     c->clear_blocks = clear_blocks_c;
3733     c->pix_sum = pix_sum_c;
3734     c->pix_norm1 = pix_norm1_c;
3735
3736     /* TODO [0] 16  [1] 8 */
3737     c->pix_abs[0][0] = pix_abs16_c;
3738     c->pix_abs[0][1] = pix_abs16_x2_c;
3739     c->pix_abs[0][2] = pix_abs16_y2_c;
3740     c->pix_abs[0][3] = pix_abs16_xy2_c;
3741     c->pix_abs[1][0] = pix_abs8_c;
3742     c->pix_abs[1][1] = pix_abs8_x2_c;
3743     c->pix_abs[1][2] = pix_abs8_y2_c;
3744     c->pix_abs[1][3] = pix_abs8_xy2_c;
3745
3746 #define dspfunc(PFX, IDX, NUM) \
3747     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3748     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3749     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3750     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3751
3752     dspfunc(put, 0, 16);
3753     dspfunc(put_no_rnd, 0, 16);
3754     dspfunc(put, 1, 8);
3755     dspfunc(put_no_rnd, 1, 8);
3756     dspfunc(put, 2, 4);
3757     dspfunc(put, 3, 2);
3758
3759     dspfunc(avg, 0, 16);
3760     dspfunc(avg_no_rnd, 0, 16);
3761     dspfunc(avg, 1, 8);
3762     dspfunc(avg_no_rnd, 1, 8);
3763     dspfunc(avg, 2, 4);
3764     dspfunc(avg, 3, 2);
3765 #undef dspfunc
3766
3767     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3768     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3769
3770     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3771     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3772     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3773     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3774     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3775     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3776     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3777     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3778     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3779
3780     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3781     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3782     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3783     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3784     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3785     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3786     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3787     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3788     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3789
3790 #define dspfunc(PFX, IDX, NUM) \
3791     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3792     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3793     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3794     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3795     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3796     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3797     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3798     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3799     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3800     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3801     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3802     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3803     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3804     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3805     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3806     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3807
3808     dspfunc(put_qpel, 0, 16);
3809     dspfunc(put_no_rnd_qpel, 0, 16);
3810
3811     dspfunc(avg_qpel, 0, 16);
3812     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3813
3814     dspfunc(put_qpel, 1, 8);
3815     dspfunc(put_no_rnd_qpel, 1, 8);
3816
3817     dspfunc(avg_qpel, 1, 8);
3818     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3819
3820     dspfunc(put_h264_qpel, 0, 16);
3821     dspfunc(put_h264_qpel, 1, 8);
3822     dspfunc(put_h264_qpel, 2, 4);
3823     dspfunc(avg_h264_qpel, 0, 16);
3824     dspfunc(avg_h264_qpel, 1, 8);
3825     dspfunc(avg_h264_qpel, 2, 4);
3826
3827 #undef dspfunc
3828     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3829     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3830     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3831     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3832     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3833     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3834
3835     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3836     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3837     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3838     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3839     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3840     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3841     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3842     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3843     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3844     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3845     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3846     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3847     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3848     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3849     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3850     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3851     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3852     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3853     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3854     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3855
3856     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3857     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3858     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3859     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3860     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3861     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3862     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3863     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3864
3865 #define SET_CMP_FUNC(name) \
3866     c->name[0]= name ## 16_c;\
3867     c->name[1]= name ## 8x8_c;
3868
3869     SET_CMP_FUNC(hadamard8_diff)
3870     c->hadamard8_diff[4]= hadamard8_intra16_c;
3871     SET_CMP_FUNC(dct_sad)
3872     SET_CMP_FUNC(dct_max)
3873     c->sad[0]= pix_abs16_c;
3874     c->sad[1]= pix_abs8_c;
3875     c->sse[0]= sse16_c;
3876     c->sse[1]= sse8_c;
3877     c->sse[2]= sse4_c;
3878     SET_CMP_FUNC(quant_psnr)
3879     SET_CMP_FUNC(rd)
3880     SET_CMP_FUNC(bit)
3881     c->vsad[0]= vsad16_c;
3882     c->vsad[4]= vsad_intra16_c;
3883     c->vsse[0]= vsse16_c;
3884     c->vsse[4]= vsse_intra16_c;
3885     c->nsse[0]= nsse16_c;
3886     c->nsse[1]= nsse8_c;
3887     c->w53[0]= w53_16_c;
3888     c->w53[1]= w53_8_c;
3889     c->w97[0]= w97_16_c;
3890     c->w97[1]= w97_8_c;
3891
3892     c->add_bytes= add_bytes_c;
3893     c->diff_bytes= diff_bytes_c;
3894     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3895     c->bswap_buf= bswap_buf;
3896
3897     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
3898     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
3899     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
3900     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
3901     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
3902     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
3903
3904     c->h263_h_loop_filter= h263_h_loop_filter_c;
3905     c->h263_v_loop_filter= h263_v_loop_filter_c;
3906
3907     c->h261_loop_filter= h261_loop_filter_c;
3908
3909     c->try_8x8basis= try_8x8basis_c;
3910     c->add_8x8basis= add_8x8basis_c;
3911
3912 #ifdef HAVE_MMX
3913     dsputil_init_mmx(c, avctx);
3914 #endif
3915 #ifdef ARCH_ARMV4L
3916     dsputil_init_armv4l(c, avctx);
3917 #endif
3918 #ifdef HAVE_MLIB
3919     dsputil_init_mlib(c, avctx);
3920 #endif
3921 #ifdef ARCH_SPARC
3922    dsputil_init_vis(c,avctx);
3923 #endif
3924 #ifdef ARCH_ALPHA
3925     dsputil_init_alpha(c, avctx);
3926 #endif
3927 #ifdef ARCH_POWERPC
3928     dsputil_init_ppc(c, avctx);
3929 #endif
3930 #ifdef HAVE_MMI
3931     dsputil_init_mmi(c, avctx);
3932 #endif
3933 #ifdef ARCH_SH4
3934     dsputil_init_sh4(c,avctx);
3935 #endif
3936
3937     switch(c->idct_permutation_type){
3938     case FF_NO_IDCT_PERM:
3939         for(i=0; i<64; i++)
3940             c->idct_permutation[i]= i;
3941         break;
3942     case FF_LIBMPEG2_IDCT_PERM:
3943         for(i=0; i<64; i++)
3944             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3945         break;
3946     case FF_SIMPLE_IDCT_PERM:
3947         for(i=0; i<64; i++)
3948             c->idct_permutation[i]= simple_mmx_permutation[i];
3949         break;
3950     case FF_TRANSPOSE_IDCT_PERM:
3951         for(i=0; i<64; i++)
3952             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3953         break;
3954     case FF_PARTTRANS_IDCT_PERM:
3955         for(i=0; i<64; i++)
3956             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3957         break;
3958     default:
3959         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3960     }
3961 }
3962