git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33
  34 /* snow.c */
  35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  36
  37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  38 uint32_t squareTbl[512] = {0, };
  39
  40 const uint8_t ff_zigzag_direct[64] = {
  41     0,   1,  8, 16,  9,  2,  3, 10,
  42     17, 24, 32, 25, 18, 11,  4,  5,
  43     12, 19, 26, 33, 40, 48, 41, 34,
  44     27, 20, 13,  6,  7, 14, 21, 28,
  45     35, 42, 49, 56, 57, 50, 43, 36,
  46     29, 22, 15, 23, 30, 37, 44, 51,
  47     58, 59, 52, 45, 38, 31, 39, 46,
  48     53, 60, 61, 54, 47, 55, 62, 63
  49 };
  50
  51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  52    specification, we interleave the fields */
  53 const uint8_t ff_zigzag248_direct[64] = {
  54      0,  8,  1,  9, 16, 24,  2, 10,
  55     17, 25, 32, 40, 48, 56, 33, 41,
  56     18, 26,  3, 11,  4, 12, 19, 27,
  57     34, 42, 49, 57, 50, 58, 35, 43,
  58     20, 28,  5, 13,  6, 14, 21, 29,
  59     36, 44, 51, 59, 52, 60, 37, 45,
  60     22, 30,  7, 15, 23, 31, 38, 46,
  61     53, 61, 54, 62, 39, 47, 55, 63,
  62 };
  63
  64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
  66
  67 const uint8_t ff_alternate_horizontal_scan[64] = {
  68     0,  1,   2,  3,  8,  9, 16, 17,
  69     10, 11,  4,  5,  6,  7, 15, 14,
  70     13, 12, 19, 18, 24, 25, 32, 33,
  71     26, 27, 20, 21, 22, 23, 28, 29,
  72     30, 31, 34, 35, 40, 41, 48, 49,
  73     42, 43, 36, 37, 38, 39, 44, 45,
  74     46, 47, 50, 51, 56, 57, 58, 59,
  75     52, 53, 54, 55, 60, 61, 62, 63,
  76 };
  77
  78 const uint8_t ff_alternate_vertical_scan[64] = {
  79     0,  8,  16, 24,  1,  9,  2, 10,
  80     17, 25, 32, 40, 48, 56, 57, 49,
  81     41, 33, 26, 18,  3, 11,  4, 12,
  82     19, 27, 34, 42, 50, 58, 35, 43,
  83     51, 59, 20, 28,  5, 13,  6, 14,
  84     21, 29, 36, 44, 52, 60, 37, 45,
  85     53, 61, 22, 30,  7, 15, 23, 31,
  86     38, 46, 54, 62, 39, 47, 55, 63,
  87 };
  88
  89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  90 const uint32_t inverse[256]={
  91          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  92  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  93  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  94  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  95  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  96  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  97   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  98   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  99   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 100   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 101   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 102   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 103   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 104   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 105   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 106   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 107   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 108   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 109   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 110   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 111   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 112   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 113   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 114   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 115   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 116   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 117   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 118   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 119   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 120   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 121   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 122   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 123 };
 124
 125 /* Input permutation for the simple_idct_mmx */
 126 static const uint8_t simple_mmx_permutation[64]={
 127         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 128         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 129         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 130         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 131         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 132         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 133         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 134         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 135 };
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= bswap_32(src[i+0]);
 212         dst[i+1]= bswap_32(src[i+1]);
 213         dst[i+2]= bswap_32(src[i+2]);
 214         dst[i+3]= bswap_32(src[i+3]);
 215         dst[i+4]= bswap_32(src[i+4]);
 216         dst[i+5]= bswap_32(src[i+5]);
 217         dst[i+6]= bswap_32(src[i+6]);
 218         dst[i+7]= bswap_32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= bswap_32(src[i+0]);
 222     }
 223 }
 224
 225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 226 {
 227     int s, i;
 228     uint32_t *sq = squareTbl + 256;
 229
 230     s = 0;
 231     for (i = 0; i < h; i++) {
 232         s += sq[pix1[0] - pix2[0]];
 233         s += sq[pix1[1] - pix2[1]];
 234         s += sq[pix1[2] - pix2[2]];
 235         s += sq[pix1[3] - pix2[3]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         s += sq[pix1[4] - pix2[4]];
 254         s += sq[pix1[5] - pix2[5]];
 255         s += sq[pix1[6] - pix2[6]];
 256         s += sq[pix1[7] - pix2[7]];
 257         pix1 += line_size;
 258         pix2 += line_size;
 259     }
 260     return s;
 261 }
 262
 263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[ 0] - pix2[ 0]];
 271         s += sq[pix1[ 1] - pix2[ 1]];
 272         s += sq[pix1[ 2] - pix2[ 2]];
 273         s += sq[pix1[ 3] - pix2[ 3]];
 274         s += sq[pix1[ 4] - pix2[ 4]];
 275         s += sq[pix1[ 5] - pix2[ 5]];
 276         s += sq[pix1[ 6] - pix2[ 6]];
 277         s += sq[pix1[ 7] - pix2[ 7]];
 278         s += sq[pix1[ 8] - pix2[ 8]];
 279         s += sq[pix1[ 9] - pix2[ 9]];
 280         s += sq[pix1[10] - pix2[10]];
 281         s += sq[pix1[11] - pix2[11]];
 282         s += sq[pix1[12] - pix2[12]];
 283         s += sq[pix1[13] - pix2[13]];
 284         s += sq[pix1[14] - pix2[14]];
 285         s += sq[pix1[15] - pix2[15]];
 286
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293
 294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 295     int s, i, j;
 296     const int dec_count= w==8 ? 3 : 4;
 297     int tmp[16*16];
 298 #if 0
 299     int level, ori;
 300     static const int scale[2][2][4][4]={
 301       {
 302         {
 303             //8x8 dec=3
 304             {268, 239, 239, 213},
 305             {  0, 224, 224, 152},
 306             {  0, 135, 135, 110},
 307         },{
 308             //16x16 dec=4
 309             {344, 310, 310, 280},
 310             {  0, 320, 320, 228},
 311             {  0, 175, 175, 136},
 312             {  0, 129, 129, 102},
 313         }
 314       },{
 315         {//FIXME 5/3
 316             //8x8 dec=3
 317             {275, 245, 245, 218},
 318             {  0, 230, 230, 156},
 319             {  0, 138, 138, 113},
 320         },{
 321             //16x16 dec=4
 322             {352, 317, 317, 286},
 323             {  0, 328, 328, 233},
 324             {  0, 180, 180, 140},
 325             {  0, 132, 132, 105},
 326         }
 327       }
 328     };
 329 #endif
 330
 331     for (i = 0; i < h; i++) {
 332         for (j = 0; j < w; j+=4) {
 333             tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 334             tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 335             tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 336             tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 337         }
 338         pix1 += line_size;
 339         pix2 += line_size;
 340     }
 341     ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
 342
 343     s=0;
 344 #if 0
 345     for(level=0; level<dec_count; level++){
 346         for(ori= level ? 1 : 0; ori<4; ori++){
 347             int sx= (ori&1) ? 1<<level: 0;
 348             int stride= 16<<(dec_count-level);
 349             int sy= (ori&2) ? stride>>1 : 0;
 350             int size= 1<<level;
 351
 352             for(i=0; i<size; i++){
 353                 for(j=0; j<size; j++){
 354                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 355                     s += ABS(v);
 356                 }
 357             }
 358         }
 359     }
 360 #endif
 361     for (i = 0; i < h; i++) {
 362         for (j = 0; j < w; j+=4) {
 363             s+= ABS(tmp[16*i+j+0]);
 364             s+= ABS(tmp[16*i+j+1]);
 365             s+= ABS(tmp[16*i+j+2]);
 366             s+= ABS(tmp[16*i+j+3]);
 367         }
 368     }
 369     assert(s>=0);
 370
 371     return s>>2;
 372 }
 373
 374 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 375     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 376 }
 377
 378 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 379     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 380 }
 381
 382 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 384 }
 385
 386 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 388 }
 389
 390 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 391 {
 392     int i;
 393
 394     /* read the pixels */
 395     for(i=0;i<8;i++) {
 396         block[0] = pixels[0];
 397         block[1] = pixels[1];
 398         block[2] = pixels[2];
 399         block[3] = pixels[3];
 400         block[4] = pixels[4];
 401         block[5] = pixels[5];
 402         block[6] = pixels[6];
 403         block[7] = pixels[7];
 404         pixels += line_size;
 405         block += 8;
 406     }
 407 }
 408
 409 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 410                           const uint8_t *s2, int stride){
 411     int i;
 412
 413     /* read the pixels */
 414     for(i=0;i<8;i++) {
 415         block[0] = s1[0] - s2[0];
 416         block[1] = s1[1] - s2[1];
 417         block[2] = s1[2] - s2[2];
 418         block[3] = s1[3] - s2[3];
 419         block[4] = s1[4] - s2[4];
 420         block[5] = s1[5] - s2[5];
 421         block[6] = s1[6] - s2[6];
 422         block[7] = s1[7] - s2[7];
 423         s1 += stride;
 424         s2 += stride;
 425         block += 8;
 426     }
 427 }
 428
 429
 430 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 431                                  int line_size)
 432 {
 433     int i;
 434     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 435
 436     /* read the pixels */
 437     for(i=0;i<8;i++) {
 438         pixels[0] = cm[block[0]];
 439         pixels[1] = cm[block[1]];
 440         pixels[2] = cm[block[2]];
 441         pixels[3] = cm[block[3]];
 442         pixels[4] = cm[block[4]];
 443         pixels[5] = cm[block[5]];
 444         pixels[6] = cm[block[6]];
 445         pixels[7] = cm[block[7]];
 446
 447         pixels += line_size;
 448         block += 8;
 449     }
 450 }
 451
 452 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 453                                  int line_size)
 454 {
 455     int i;
 456     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 457
 458     /* read the pixels */
 459     for(i=0;i<4;i++) {
 460         pixels[0] = cm[block[0]];
 461         pixels[1] = cm[block[1]];
 462         pixels[2] = cm[block[2]];
 463         pixels[3] = cm[block[3]];
 464
 465         pixels += line_size;
 466         block += 8;
 467     }
 468 }
 469
 470 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 471                                  int line_size)
 472 {
 473     int i;
 474     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 475
 476     /* read the pixels */
 477     for(i=0;i<2;i++) {
 478         pixels[0] = cm[block[0]];
 479         pixels[1] = cm[block[1]];
 480
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 487                                         uint8_t *restrict pixels,
 488                                         int line_size)
 489 {
 490     int i, j;
 491
 492     for (i = 0; i < 8; i++) {
 493         for (j = 0; j < 8; j++) {
 494             if (*block < -128)
 495                 *pixels = 0;
 496             else if (*block > 127)
 497                 *pixels = 255;
 498             else
 499                 *pixels = (uint8_t)(*block + 128);
 500             block++;
 501             pixels++;
 502         }
 503         pixels += (line_size - 8);
 504     }
 505 }
 506
 507 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 508                           int line_size)
 509 {
 510     int i;
 511     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 512
 513     /* read the pixels */
 514     for(i=0;i<8;i++) {
 515         pixels[0] = cm[pixels[0] + block[0]];
 516         pixels[1] = cm[pixels[1] + block[1]];
 517         pixels[2] = cm[pixels[2] + block[2]];
 518         pixels[3] = cm[pixels[3] + block[3]];
 519         pixels[4] = cm[pixels[4] + block[4]];
 520         pixels[5] = cm[pixels[5] + block[5]];
 521         pixels[6] = cm[pixels[6] + block[6]];
 522         pixels[7] = cm[pixels[7] + block[7]];
 523         pixels += line_size;
 524         block += 8;
 525     }
 526 }
 527
 528 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 529                           int line_size)
 530 {
 531     int i;
 532     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 533
 534     /* read the pixels */
 535     for(i=0;i<4;i++) {
 536         pixels[0] = cm[pixels[0] + block[0]];
 537         pixels[1] = cm[pixels[1] + block[1]];
 538         pixels[2] = cm[pixels[2] + block[2]];
 539         pixels[3] = cm[pixels[3] + block[3]];
 540         pixels += line_size;
 541         block += 8;
 542     }
 543 }
 544
 545 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 546                           int line_size)
 547 {
 548     int i;
 549     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 550
 551     /* read the pixels */
 552     for(i=0;i<2;i++) {
 553         pixels[0] = cm[pixels[0] + block[0]];
 554         pixels[1] = cm[pixels[1] + block[1]];
 555         pixels += line_size;
 556         block += 8;
 557     }
 558 }
 559 #if 0
 560
 561 #define PIXOP2(OPNAME, OP) \
 562 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 563 {\
 564     int i;\
 565     for(i=0; i<h; i++){\
 566         OP(*((uint64_t*)block), LD64(pixels));\
 567         pixels+=line_size;\
 568         block +=line_size;\
 569     }\
 570 }\
 571 \
 572 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 573 {\
 574     int i;\
 575     for(i=0; i<h; i++){\
 576         const uint64_t a= LD64(pixels  );\
 577         const uint64_t b= LD64(pixels+1);\
 578         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 579         pixels+=line_size;\
 580         block +=line_size;\
 581     }\
 582 }\
 583 \
 584 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 585 {\
 586     int i;\
 587     for(i=0; i<h; i++){\
 588         const uint64_t a= LD64(pixels  );\
 589         const uint64_t b= LD64(pixels+1);\
 590         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 591         pixels+=line_size;\
 592         block +=line_size;\
 593     }\
 594 }\
 595 \
 596 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 597 {\
 598     int i;\
 599     for(i=0; i<h; i++){\
 600         const uint64_t a= LD64(pixels          );\
 601         const uint64_t b= LD64(pixels+line_size);\
 602         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 603         pixels+=line_size;\
 604         block +=line_size;\
 605     }\
 606 }\
 607 \
 608 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 609 {\
 610     int i;\
 611     for(i=0; i<h; i++){\
 612         const uint64_t a= LD64(pixels          );\
 613         const uint64_t b= LD64(pixels+line_size);\
 614         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 615         pixels+=line_size;\
 616         block +=line_size;\
 617     }\
 618 }\
 619 \
 620 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 621 {\
 622         int i;\
 623         const uint64_t a= LD64(pixels  );\
 624         const uint64_t b= LD64(pixels+1);\
 625         uint64_t l0=  (a&0x0303030303030303ULL)\
 626                     + (b&0x0303030303030303ULL)\
 627                     + 0x0202020202020202ULL;\
 628         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 629                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 630         uint64_t l1,h1;\
 631 \
 632         pixels+=line_size;\
 633         for(i=0; i<h; i+=2){\
 634             uint64_t a= LD64(pixels  );\
 635             uint64_t b= LD64(pixels+1);\
 636             l1=  (a&0x0303030303030303ULL)\
 637                + (b&0x0303030303030303ULL);\
 638             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 639               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 640             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 641             pixels+=line_size;\
 642             block +=line_size;\
 643             a= LD64(pixels  );\
 644             b= LD64(pixels+1);\
 645             l0=  (a&0x0303030303030303ULL)\
 646                + (b&0x0303030303030303ULL)\
 647                + 0x0202020202020202ULL;\
 648             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 649               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 650             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 651             pixels+=line_size;\
 652             block +=line_size;\
 653         }\
 654 }\
 655 \
 656 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 657 {\
 658         int i;\
 659         const uint64_t a= LD64(pixels  );\
 660         const uint64_t b= LD64(pixels+1);\
 661         uint64_t l0=  (a&0x0303030303030303ULL)\
 662                     + (b&0x0303030303030303ULL)\
 663                     + 0x0101010101010101ULL;\
 664         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 665                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 666         uint64_t l1,h1;\
 667 \
 668         pixels+=line_size;\
 669         for(i=0; i<h; i+=2){\
 670             uint64_t a= LD64(pixels  );\
 671             uint64_t b= LD64(pixels+1);\
 672             l1=  (a&0x0303030303030303ULL)\
 673                + (b&0x0303030303030303ULL);\
 674             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 675               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 676             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 677             pixels+=line_size;\
 678             block +=line_size;\
 679             a= LD64(pixels  );\
 680             b= LD64(pixels+1);\
 681             l0=  (a&0x0303030303030303ULL)\
 682                + (b&0x0303030303030303ULL)\
 683                + 0x0101010101010101ULL;\
 684             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 685               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 686             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 687             pixels+=line_size;\
 688             block +=line_size;\
 689         }\
 690 }\
 691 \
 692 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 693 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 694 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 695 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 696 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 697 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 698 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 699
 700 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 701 #else // 64 bit variant
 702
 703 #define PIXOP2(OPNAME, OP) \
 704 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 705     int i;\
 706     for(i=0; i<h; i++){\
 707         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 708         pixels+=line_size;\
 709         block +=line_size;\
 710     }\
 711 }\
 712 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 713     int i;\
 714     for(i=0; i<h; i++){\
 715         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 716         pixels+=line_size;\
 717         block +=line_size;\
 718     }\
 719 }\
 720 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 721     int i;\
 722     for(i=0; i<h; i++){\
 723         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 724         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 725         pixels+=line_size;\
 726         block +=line_size;\
 727     }\
 728 }\
 729 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 730     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 731 }\
 732 \
 733 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 734                                                 int src_stride1, int src_stride2, int h){\
 735     int i;\
 736     for(i=0; i<h; i++){\
 737         uint32_t a,b;\
 738         a= LD32(&src1[i*src_stride1  ]);\
 739         b= LD32(&src2[i*src_stride2  ]);\
 740         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 741         a= LD32(&src1[i*src_stride1+4]);\
 742         b= LD32(&src2[i*src_stride2+4]);\
 743         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 744     }\
 745 }\
 746 \
 747 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 748                                                 int src_stride1, int src_stride2, int h){\
 749     int i;\
 750     for(i=0; i<h; i++){\
 751         uint32_t a,b;\
 752         a= LD32(&src1[i*src_stride1  ]);\
 753         b= LD32(&src2[i*src_stride2  ]);\
 754         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 755         a= LD32(&src1[i*src_stride1+4]);\
 756         b= LD32(&src2[i*src_stride2+4]);\
 757         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 758     }\
 759 }\
 760 \
 761 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 762                                                 int src_stride1, int src_stride2, int h){\
 763     int i;\
 764     for(i=0; i<h; i++){\
 765         uint32_t a,b;\
 766         a= LD32(&src1[i*src_stride1  ]);\
 767         b= LD32(&src2[i*src_stride2  ]);\
 768         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 769     }\
 770 }\
 771 \
 772 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 773                                                 int src_stride1, int src_stride2, int h){\
 774     int i;\
 775     for(i=0; i<h; i++){\
 776         uint32_t a,b;\
 777         a= LD16(&src1[i*src_stride1  ]);\
 778         b= LD16(&src2[i*src_stride2  ]);\
 779         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 780     }\
 781 }\
 782 \
 783 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 784                                                 int src_stride1, int src_stride2, int h){\
 785     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 786     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 787 }\
 788 \
 789 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 790                                                 int src_stride1, int src_stride2, int h){\
 791     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 792     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 793 }\
 794 \
 795 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 796     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 797 }\
 798 \
 799 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 800     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 801 }\
 802 \
 803 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 804     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 805 }\
 806 \
 807 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 808     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 809 }\
 810 \
 811 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 812                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){/* combines four 8-byte-wide sources over h rows, each with its own stride; every byte handed to OP is (a+b+c+d+2)>>2 of the four source bytes */\
 813     int i;\
 814     for(i=0; i<h; i++){\
 815         uint32_t a, b, c, d, l0, l1, h0, h1;/* l* hold sums of the low 2 bits of each byte, h* sums of the high 6 bits already shifted down by 2; splitting keeps carries from crossing byte lanes */\
 816         a= LD32(&src1[i*src_stride1]);/* LD32 is defined outside this excerpt; presumably a 32-bit load - TODO confirm */\
 817         b= LD32(&src2[i*src_stride2]);\
 818         c= LD32(&src3[i*src_stride3]);\
 819         d= LD32(&src4[i*src_stride4]);\
 820         l0=  (a&0x03030303UL)\
 821            + (b&0x03030303UL)\
 822            + 0x02020202UL;/* +2 in every byte: rounding bias for the final >>2 */\
 823         h0= ((a&0xFCFCFCFCUL)>>2)\
 824           + ((b&0xFCFCFCFCUL)>>2);\
 825         l1=  (c&0x03030303UL)\
 826            + (d&0x03030303UL);\
 827         h1= ((c&0xFCFCFCFCUL)>>2)\
 828           + ((d&0xFCFCFCFCUL)>>2);\
 829         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));/* bytes 0..3 of the row; the mask drops bits shifted in from the neighbouring byte */\
 830         a= LD32(&src1[i*src_stride1+4]);/* same computation for bytes 4..7 */\
 831         b= LD32(&src2[i*src_stride2+4]);\
 832         c= LD32(&src3[i*src_stride3+4]);\
 833         d= LD32(&src4[i*src_stride4+4]);\
 834         l0=  (a&0x03030303UL)\
 835            + (b&0x03030303UL)\
 836            + 0x02020202UL;\
 837         h0= ((a&0xFCFCFCFCUL)>>2)\
 838           + ((b&0xFCFCFCFCUL)>>2);\
 839         l1=  (c&0x03030303UL)\
 840            + (d&0x03030303UL);\
 841         h1= ((c&0xFCFCFCFCUL)>>2)\
 842           + ((d&0xFCFCFCFCUL)>>2);\
 843         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 844     }\
 845 }\
 846 \
 847 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){/* horizontal pairing, 4 wide: passes each pixel and its right neighbour (pixels+1) to _pixels4_l2 with line_size as every stride */\
 848     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 849 }\
 850 \
 851 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){/* vertical pairing, 4 wide: passes each row and the row below it (pixels+line_size) to _pixels4_l2 */\
 852     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 853 }\
 854 \
 855 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){/* horizontal pairing, 2 wide: passes each pixel and its right neighbour (pixels+1) to _pixels2_l2 with line_size as every stride */\
 856     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 857 }\
 858 \
 859 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){/* vertical pairing, 2 wide: passes each row and the row below it (pixels+line_size) to _pixels2_l2 */\
 860     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 861 }\
 862 \
 863 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 864                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){/* like _pixels8_l4 but with a bias of 1 instead of 2: every byte handed to OP is (a+b+c+d+1)>>2 of the four source bytes */\
 865     int i;\
 866     for(i=0; i<h; i++){\
 867         uint32_t a, b, c, d, l0, l1, h0, h1;/* l* = per-byte sums of the low 2 bits, h* = per-byte sums of the high 6 bits shifted down by 2 */\
 868         a= LD32(&src1[i*src_stride1]);/* LD32 is defined outside this excerpt; presumably a 32-bit load - TODO confirm */\
 869         b= LD32(&src2[i*src_stride2]);\
 870         c= LD32(&src3[i*src_stride3]);\
 871         d= LD32(&src4[i*src_stride4]);\
 872         l0=  (a&0x03030303UL)\
 873            + (b&0x03030303UL)\
 874            + 0x01010101UL;/* +1 in every byte: the smaller bias is the only difference from the rounding variant */\
 875         h0= ((a&0xFCFCFCFCUL)>>2)\
 876           + ((b&0xFCFCFCFCUL)>>2);\
 877         l1=  (c&0x03030303UL)\
 878            + (d&0x03030303UL);\
 879         h1= ((c&0xFCFCFCFCUL)>>2)\
 880           + ((d&0xFCFCFCFCUL)>>2);\
 881         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));/* bytes 0..3 of the row */\
 882         a= LD32(&src1[i*src_stride1+4]);/* same computation for bytes 4..7 */\
 883         b= LD32(&src2[i*src_stride2+4]);\
 884         c= LD32(&src3[i*src_stride3+4]);\
 885         d= LD32(&src4[i*src_stride4+4]);\
 886         l0=  (a&0x03030303UL)\
 887            + (b&0x03030303UL)\
 888            + 0x01010101UL;\
 889         h0= ((a&0xFCFCFCFCUL)>>2)\
 890           + ((b&0xFCFCFCFCUL)>>2);\
 891         l1=  (c&0x03030303UL)\
 892            + (d&0x03030303UL);\
 893         h1= ((c&0xFCFCFCFCUL)>>2)\
 894           + ((d&0xFCFCFCFCUL)>>2);\
 895         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 896     }\
 897 }\
 898 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 899                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){/* 16-pixel-wide four-source combine: runs _pixels8_l4 on both 8-pixel halves */\
 900     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);/* columns 0..7 */\
 901     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);/* columns 8..15 */\
 902 }\
 903 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 904                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){/* 16-pixel-wide four-source combine with the smaller bias: runs _no_rnd_pixels8_l4 on both 8-pixel halves */\
 905     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);/* columns 0..7 */\
 906     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);/* columns 8..15 */\
 907 }\
 908 \
 909 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 910 {/* 2-pixel-wide diagonal interpolation: each output is (sum of the 2x2 source neighbourhood + 2)>>2; two rows are written per loop pass, so an odd h writes h+1 rows */\
 911         int i, a0, b0, a1, b1;/* a* = horizontal pair sum for column 0, b* = for column 1; the 0-suffixed pair carries the +2 bias */\
 912         a0= pixels[0];\
 913         b0= pixels[1] + 2;\
 914         a0 += b0;/* a0 = p[0]+p[1]+2 */\
 915         b0 += pixels[2];/* b0 = p[1]+p[2]+2 */\
 916 \
 917         pixels+=line_size;\
 918         for(i=0; i<h; i+=2){\
 919             a1= pixels[0];\
 920             b1= pixels[1];\
 921             a1 += b1;\
 922             b1 += pixels[2];\
 923 \
 924             block[0]= (a1+a0)>>2; /* FIXME non put: plain store, OP is not applied here */\
 925             block[1]= (b1+b0)>>2;\
 926 \
 927             pixels+=line_size;\
 928             block +=line_size;\
 929 \
 930             a0= pixels[0];/* recompute the biased sums from the next row; a1/b1 from the row above are reused */\
 931             b0= pixels[1] + 2;\
 932             a0 += b0;\
 933             b0 += pixels[2];\
 934 \
 935             block[0]= (a1+a0)>>2;\
 936             block[1]= (b1+b0)>>2;\
 937             pixels+=line_size;\
 938             block +=line_size;\
 939         }\
 940 }\
 941 \
 942 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 943 {/* 4-pixel-wide diagonal interpolation on packed 32-bit words: each byte handed to OP is (sum of the 2x2 source neighbourhood + 2)>>2; two rows are produced per loop pass */\
 944         int i;\
 945         const uint32_t a= LD32(pixels  );/* first source row and the same row shifted one pixel right */\
 946         const uint32_t b= LD32(pixels+1);\
 947         uint32_t l0=  (a&0x03030303UL)\
 948                     + (b&0x03030303UL)\
 949                     + 0x02020202UL;/* low-2-bit sums of the first row plus the rounding bias */\
 950         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 951                    + ((b&0xFCFCFCFCUL)>>2);/* high-6-bit sums of the first row, shifted down by 2 */\
 952         uint32_t l1,h1;\
 953 \
 954         pixels+=line_size;\
 955         for(i=0; i<h; i+=2){\
 956             uint32_t a= LD32(pixels  );/* shadows the outer a/b with the next source row */\
 957             uint32_t b= LD32(pixels+1);\
 958             l1=  (a&0x03030303UL)\
 959                + (b&0x03030303UL);/* no bias here: l0 already carries it */\
 960             h1= ((a&0xFCFCFCFCUL)>>2)\
 961               + ((b&0xFCFCFCFCUL)>>2);\
 962             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));/* output row from source rows held in l0/h0 and l1/h1 */\
 963             pixels+=line_size;\
 964             block +=line_size;\
 965             a= LD32(pixels  );\
 966             b= LD32(pixels+1);\
 967             l0=  (a&0x03030303UL)\
 968                + (b&0x03030303UL)\
 969                + 0x02020202UL;/* l0/h0 are refreshed from the following row; l1/h1 are reused, so each row is summed once */\
 970             h0= ((a&0xFCFCFCFCUL)>>2)\
 971               + ((b&0xFCFCFCFCUL)>>2);\
 972             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 973             pixels+=line_size;\
 974             block +=line_size;\
 975         }\
 976 }\
 977 \
 978 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 979 {/* 8-pixel-wide diagonal interpolation: each byte handed to OP is (sum of the 2x2 source neighbourhood + 2)>>2; the block is processed as two 4-byte-wide columns */\
 980     int j;\
 981     for(j=0; j<2; j++){/* j selects the left (0) or right (1) 4-byte column */\
 982         int i;\
 983         const uint32_t a= LD32(pixels  );\
 984         const uint32_t b= LD32(pixels+1);\
 985         uint32_t l0=  (a&0x03030303UL)\
 986                     + (b&0x03030303UL)\
 987                     + 0x02020202UL;/* low-2-bit sums of the first row plus the rounding bias */\
 988         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 989                    + ((b&0xFCFCFCFCUL)>>2);\
 990         uint32_t l1,h1;\
 991 \
 992         pixels+=line_size;\
 993         for(i=0; i<h; i+=2){/* two output rows per pass */\
 994             uint32_t a= LD32(pixels  );\
 995             uint32_t b= LD32(pixels+1);\
 996             l1=  (a&0x03030303UL)\
 997                + (b&0x03030303UL);\
 998             h1= ((a&0xFCFCFCFCUL)>>2)\
 999               + ((b&0xFCFCFCFCUL)>>2);\
1000             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1001             pixels+=line_size;\
1002             block +=line_size;\
1003             a= LD32(pixels  );\
1004             b= LD32(pixels+1);\
1005             l0=  (a&0x03030303UL)\
1006                + (b&0x03030303UL)\
1007                + 0x02020202UL;\
1008             h0= ((a&0xFCFCFCFCUL)>>2)\
1009               + ((b&0xFCFCFCFCUL)>>2);\
1010             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011             pixels+=line_size;\
1012             block +=line_size;\
1013         }\
1014         pixels+=4-line_size*(h+1);/* back up h+1 source rows and step 4 bytes right; this lands on the top row only when h is even */\
1015         block +=4-line_size*h;/* back up h destination rows and step 4 bytes right */\
1016     }\
1017 }\
1018 \
1019 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1020 {/* 8-pixel-wide diagonal interpolation with the smaller bias: each byte handed to OP is (sum of the 2x2 source neighbourhood + 1)>>2; processed as two 4-byte-wide columns */\
1021     int j;\
1022     for(j=0; j<2; j++){/* j selects the left (0) or right (1) 4-byte column */\
1023         int i;\
1024         const uint32_t a= LD32(pixels  );\
1025         const uint32_t b= LD32(pixels+1);\
1026         uint32_t l0=  (a&0x03030303UL)\
1027                     + (b&0x03030303UL)\
1028                     + 0x01010101UL;/* +1 per byte instead of the +2 used by the rounding variant */\
1029         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1030                    + ((b&0xFCFCFCFCUL)>>2);\
1031         uint32_t l1,h1;\
1032 \
1033         pixels+=line_size;\
1034         for(i=0; i<h; i+=2){/* two output rows per pass */\
1035             uint32_t a= LD32(pixels  );\
1036             uint32_t b= LD32(pixels+1);\
1037             l1=  (a&0x03030303UL)\
1038                + (b&0x03030303UL);\
1039             h1= ((a&0xFCFCFCFCUL)>>2)\
1040               + ((b&0xFCFCFCFCUL)>>2);\
1041             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042             pixels+=line_size;\
1043             block +=line_size;\
1044             a= LD32(pixels  );\
1045             b= LD32(pixels+1);\
1046             l0=  (a&0x03030303UL)\
1047                + (b&0x03030303UL)\
1048                + 0x01010101UL;\
1049             h0= ((a&0xFCFCFCFCUL)>>2)\
1050               + ((b&0xFCFCFCFCUL)>>2);\
1051             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1052             pixels+=line_size;\
1053             block +=line_size;\
1054         }\
1055         pixels+=4-line_size*(h+1);/* back up h+1 source rows and step 4 bytes right; this lands on the top row only when h is even */\
1056         block +=4-line_size*h;/* back up h destination rows and step 4 bytes right */\
1057     }\
1058 }\
1059 \
1060 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)/* CALL_2X_PIXELS is defined outside this excerpt; presumably it defines the first name as two calls of the second, 8 bytes apart - TODO confirm */\
1061 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1062 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1063 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1064 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)/* the no_rnd 16-wide full-pel routine is built from the ordinary _pixels8_c, not from a no_rnd 8-wide one */\
1065 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1066 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1067 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1068
/* "avg" flavour of OP: the destination becomes rnd_avg32(old destination, new value).
 * rnd_avg32 is defined outside this excerpt; presumably a rounded per-byte
 * average of two packed 32-bit words - TODO confirm. */
1069 #define op_avg(a, b) a = rnd_avg32(a, b)
/* Closes a preprocessor conditional that was opened before this excerpt. */
1070 #endif
/* "put" flavour of OP: the destination is simply overwritten. */
1071 #define op_put(a, b) a = b
1072 
/* Instantiate the whole PIXOP2 family twice: avg_* functions and put_* functions. */
1073 PIXOP2(avg, op_avg)
1074 PIXOP2(put, op_put)
/* The op_* helpers are only needed for the two instantiations above. */
1075 #undef op_avg
1076 #undef op_put
1077
/*
 * Rounded means of two and of four values: (sum + half the divisor) shifted
 * right by 1 or 2.  Every argument is parenthesised so that an argument
 * containing a lower-precedence operator (e.g. avg2(x|y, z)) is evaluated as
 * a unit before being added.  Arguments are still evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1080
/**
 * Single-stride entry point for the 16-pixel-wide put_no_rnd two-source
 * routine: the same stride is used for the destination and both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src1_stride = stride;
    const int src2_stride = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src1_stride, src2_stride, h);
}
1084
/**
 * Single-stride entry point for the 8-pixel-wide put_no_rnd two-source
 * routine: the same stride is used for the destination and both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src1_stride = stride;
    const int src2_stride = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src1_stride, src2_stride, h);
}
1088
/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block at a fractional
 * offset given in 1/16 pixel units.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  distance in bytes between rows, for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand samples (out of 16)
 * @param y16     vertical weight of the lower samples (out of 16)
 * @param rounder value added before the final >>8
 *
 * The four weights always add up to 256, hence the shift by 8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right = x16 * (16 - y16);
    const int w_bot_left  = (16 - x16) * y16;
    const int w_bot_right = x16 * y16;
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left  * src[col]          + w_top_right * src[col + 1]
                      + w_bot_left  * src[stride + col] + w_bot_right * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1111
/**
 * Motion compensation of an 8-pixel-wide, h-row block where the source
 * position changes per pixel.
 *
 * The source position starts at (ox, oy) for each row, advances by (dxx, dyx)
 * per output column and the row start advances by (dxy, dyy) per output row.
 * After dropping the low 16 bits, the lowest `shift` bits of a position are the
 * interpolation fraction and the rest is the integer sample coordinate.
 *
 * Samples are bilinearly interpolated when both coordinates lie inside
 * [0, width-1) x [0, height-1); otherwise the out-of-range coordinate is
 * clamped with clip() and interpolation is done only along the in-range axis
 * (or not at all when both are out of range).
 *
 * @param r  rounding value added before the final >> (2*shift)
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one       = 1 << shift;   /* weight representing 1.0 */
    const int frac_mask = one - 1;
    const int last_x    = width  - 1;
    const int last_y    = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + row*stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) {
            const int fx = (vx >> 16) & frac_mask;
            const int fy = (vy >> 16) & frac_mask;
            const int sx = (vx >> 16) >> shift;
            const int sy = (vy >> 16) >> shift;
            /* the unsigned compare also rejects negative coordinates */
            const int x_inside = (unsigned)sx < last_x;
            const int y_inside = (unsigned)sy < last_y;
            int index;

            if (x_inside && y_inside) {
                index = sx + sy*stride;
                out[col] = (  (  src[index         ]*(one-fx)
                               + src[index       +1]*   fx    )*(one-fy)
                            + (  src[index+stride  ]*(one-fx)
                               + src[index+stride+1]*   fx    )*   fy
                            + r) >> (shift*2);
            } else if (x_inside) {
                /* row clamped: interpolate horizontally only */
                index = sx + clip(sy, 0, last_y)*stride;
                out[col] = ( (  src[index  ]*(one-fx)
                              + src[index+1]*   fx    )*one
                            + r) >> (shift*2);
            } else if (y_inside) {
                /* column clamped: interpolate vertically only */
                index = clip(sx, 0, last_x) + sy*stride;
                out[col] = ( (  src[index       ]*(one-fy)
                              + src[index+stride]*   fy    )*one
                            + r) >> (shift*2);
            } else {
                /* both clamped: copy the nearest valid sample */
                index = clip(sx, 0, last_x) + clip(sy, 0, last_y)*stride;
                out[col] = src[index];
            }
        }
    }
}
1169
/**
 * Third-pel "put" at integer position: forwards to the plain put_pixels
 * routine matching the block width.  Widths other than 2, 4, 8 and 16 do
 * nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1178
/**
 * Third-pel "put", horizontal only: each output is the 2:1 weighted mean of
 * a sample and its right neighbour.  The division by 3 is done as
 * (683 * sum) >> 11.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1189
/**
 * Third-pel "put", horizontal only: each output is the 1:2 weighted mean of
 * a sample and its right neighbour.  The division by 3 is done as
 * (683 * sum) >> 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1200
/**
 * Third-pel "put", vertical only: each output is the 2:1 weighted mean of a
 * sample and the sample one row below.  The division by 3 is done as
 * (683 * sum) >> 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1211
/**
 * Third-pel "put", diagonal: weights 4/3/3/2 (out of 12) for the current,
 * right, lower and lower-right samples.  The division by 12 is done as
 * (2731 * sum) >> 15.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1222
/**
 * Third-pel "put", diagonal: weights 3/2/4/3 (out of 12) for the current,
 * right, lower and lower-right samples.  The division by 12 is done as
 * (2731 * sum) >> 15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1233
/**
 * Third-pel "put", vertical only: each output is the 1:2 weighted mean of a
 * sample and the sample one row below.  The division by 3 is done as
 * (683 * sum) >> 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1244
/**
 * Third-pel "put", diagonal: weights 3/4/2/3 (out of 12) for the current,
 * right, lower and lower-right samples.  The division by 12 is done as
 * (2731 * sum) >> 15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1255
/**
 * Third-pel "put", diagonal: weights 2/3/3/4 (out of 12) for the current,
 * right, lower and lower-right samples.  The division by 12 is done as
 * (2731 * sum) >> 15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1266
/**
 * Third-pel "avg" at integer position: forwards to the plain avg_pixels
 * routine matching the block width.  Widths other than 2, 4, 8 and 16 do
 * nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1275
/**
 * Third-pel "avg", horizontal only: interpolates a sample and its right
 * neighbour with weights 2:1 (division by 3 as (683 * sum) >> 11), then stores
 * the rounded-up mean of that value and the existing destination byte.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1286
/**
 * Third-pel "avg", horizontal only: interpolates a sample and its right
 * neighbour with weights 1:2 (division by 3 as (683 * sum) >> 11), then stores
 * the rounded-up mean of that value and the existing destination byte.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1297
/**
 * Third-pel "avg", vertical only: interpolates a sample and the one below it
 * with weights 2:1 (division by 3 as (683 * sum) >> 11), then stores the
 * rounded-up mean of that value and the existing destination byte.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1308
/**
 * Third-pel "avg", diagonal: interpolates with weights 4/3/3/2 (out of 12)
 * for the current, right, lower and lower-right samples (division by 12 as
 * (2731 * sum) >> 15), then stores the rounded-up mean of that value and the
 * existing destination byte.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1319
/**
 * Third-pel "avg", diagonal: interpolates with weights 3/2/4/3 (out of 12)
 * for the current, right, lower and lower-right samples (division by 12 as
 * (2731 * sum) >> 15), then stores the rounded-up mean of that value and the
 * existing destination byte.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1330
/**
 * Third-pel "avg", vertical only: interpolates a sample and the one below it
 * with weights 1:2 (division by 3 as (683 * sum) >> 11), then stores the
 * rounded-up mean of that value and the existing destination byte.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1341
/**
 * Third-pel "avg", diagonal: interpolates with weights 3/4/2/3 (out of 12)
 * for the current, right, lower and lower-right samples (division by 12 as
 * (2731 * sum) >> 15), then stores the rounded-up mean of that value and the
 * existing destination byte.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1352
/**
 * Third-pel "avg", diagonal: interpolates with weights 2/3/3/4 (out of 12)
 * for the current, right, lower and lower-right samples (division by 12 as
 * (2731 * sum) >> 15), then stores the rounded-up mean of that value and the
 * existing destination byte.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = height; row > 0; row--, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/*
 * Disabled generator for fixed-width third-pel "put" wrappers.
 * TPEL_WIDTH(w) would define put_tpel_pixels<w>_mcXY_c(dst, src, stride, height)
 * for the nine XY positions, each forwarding to the generic
 * put_tpel_pixels_mcXY_c with the width filled in.
 *
 * The wrapper bodies used to start with a stray "void" before the call, which
 * is not a valid statement once the macro is expanded; they are plain calls
 * now.  The block is still compiled out.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1384
/*
 * Generates the chroma motion-compensation routines for block widths 2, 4
 * and 8.  Each output sample is a bilinear blend of a 2x2 source
 * neighbourhood at a fractional offset (x, y) in 1/8 pixel units, both in
 * 0..7.  The four weights add up to 64; OP receives the unscaled weighted sum
 * and is responsible for rounding, scaling and storing it.
 * Each routine reads one column to the right and one row below the block.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
    }\
}

/* avg: round the weighted sum to a sample ((b+32)>>6), then take the
 * rounded-up mean with the existing destination sample. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
/* put: round the weighted sum to a sample and store it. */
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1455
/**
 * Copies h rows of 4 bytes from src to dst as one 32-bit word per row,
 * using the LD32/ST32 accessors.  Source and destination have independent
 * strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        ST32(dst, LD32(src));
        src += srcStride;
        dst += dstStride;
    }
}
1466
/**
 * Copies h rows of 8 bytes from src to dst as two 32-bit words per row,
 * using the LD32/ST32 accessors.  Source and destination have independent
 * strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        src += srcStride;
        dst += dstStride;
    }
}
1478
/**
 * Copies h rows of 16 bytes from src to dst as four 32-bit words per row,
 * using the LD32/ST32 accessors.  Source and destination have independent
 * strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        src += srcStride;
        dst += dstStride;
    }
}
1492
/**
 * Copies h rows of 17 bytes from src to dst: four 32-bit words through the
 * LD32/ST32 accessors followed by one single byte.  Source and destination
 * have independent strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        dst[16] = src[16];   /* the odd 17th byte */
        src += srcStride;
        dst += dstStride;
    }
}
1507
/**
 * Copies h rows of 9 bytes from src to dst: two 32-bit words through the
 * LD32/ST32 accessors followed by one single byte.  Source and destination
 * have independent strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        dst[8] = src[8];   /* the odd 9th byte */
        src += srcStride;
        dst += dstStride;
    }
}
1520
1521
1522 #define QPEL_MC(r, OPNAME, RND, OP) \
1523 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1524     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1525     int i;\
1526     for(i=0; i<h; i++)\
1527     {\
1528         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1529         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1530         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1531         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1532         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1533         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1534         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1535         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1536         dst+=dstStride;\
1537         src+=srcStride;\
1538     }\
1539 }\
1540 \
1541 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1542     const int w=8;\
1543     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1544     int i;\
1545     for(i=0; i<w; i++)\
1546     {\
1547         const int src0= src[0*srcStride];\
1548         const int src1= src[1*srcStride];\
1549         const int src2= src[2*srcStride];\
1550         const int src3= src[3*srcStride];\
1551         const int src4= src[4*srcStride];\
1552         const int src5= src[5*srcStride];\
1553         const int src6= src[6*srcStride];\
1554         const int src7= src[7*srcStride];\
1555         const int src8= src[8*srcStride];\
1556         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1557         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1558         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1559         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1560         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1561         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1562         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1563         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1564         dst++;\
1565         src++;\
1566     }\
1567 }\
1568 \
/* Horizontal MPEG-4 qpel lowpass producing 16 outputs per row for h rows.\
 * Every row reads src[0]..src[16] (17 samples) and writes dst[0]..dst[15]\
 * through OP. Interior outputs use taps 20, -6, 3, -1 on symmetric sample\
 * pairs; for the first and last three outputs the outer taps are folded back\
 * onto samples inside src[0]..src[16].\
 * The value handed to OP is the raw filter sum; any scaling or clipping is\
 * left to OP, which may index the local table pointer cm. */\
1569 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1571     int i;\
1572     \
1573     for(i=0; i<h; i++)\
1574     {\
1575         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1576         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1577         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1578         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1579         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1580         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1581         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1582         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1583         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1584         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1585         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1586         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1587         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1588         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1589         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1590         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        /* advance to the next row */\
1591         dst+=dstStride;\
1592         src+=srcStride;\
1593     }\
1594 }\
1595 \
/* Vertical MPEG-4 qpel lowpass for a block 16 columns wide and 16 rows high.\
 * Each loop iteration handles one column: it reads 17 vertically adjacent\
 * samples (rows 0..16, srcStride apart) and emits 16 results through OP.\
 * Interior rows use taps 20, -6, 3, -1 on symmetric sample pairs; for the\
 * first and last three rows the outer taps are folded back onto samples\
 * inside rows 0..16.\
 * The value handed to OP is the raw filter sum; any scaling or clipping is\
 * left to OP, which may index the local table pointer cm. */\
1596 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1597     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1598     int i;\
1599     const int w=16;\
1600     for(i=0; i<w; i++)\
1601     {\
        /* load the whole column before any store to dst */\
1602         const int src0= src[0*srcStride];\
1603         const int src1= src[1*srcStride];\
1604         const int src2= src[2*srcStride];\
1605         const int src3= src[3*srcStride];\
1606         const int src4= src[4*srcStride];\
1607         const int src5= src[5*srcStride];\
1608         const int src6= src[6*srcStride];\
1609         const int src7= src[7*srcStride];\
1610         const int src8= src[8*srcStride];\
1611         const int src9= src[9*srcStride];\
1612         const int src10= src[10*srcStride];\
1613         const int src11= src[11*srcStride];\
1614         const int src12= src[12*srcStride];\
1615         const int src13= src[13*srcStride];\
1616         const int src14= src[14*srcStride];\
1617         const int src15= src[15*srcStride];\
1618         const int src16= src[16*srcStride];\
1619         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1620         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1621         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1622         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1623         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1624         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1625         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1626         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1627         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1628         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1629         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1630         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1631         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1632         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1633         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1634         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        /* advance to the next column */\
1635         dst++;\
1636         src++;\
1637     }\
1638 }\
1639 \
1640 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1641     OPNAME ## pixels8_c(dst, src, stride, 8);\
1642 }\
1643 \
1644 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1645     uint8_t half[64];\
1646     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1647     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1648 }\
1649 \
1650 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1651     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1652 }\
1653 \
1654 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1655     uint8_t half[64];\
1656     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1657     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1658 }\
1659 \
1660 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1661     uint8_t full[16*9];\
1662     uint8_t half[64];\
1663     copy_block9(full, src, 16, stride, 9);\
1664     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1665     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1666 }\
1667 \
1668 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1669     uint8_t full[16*9];\
1670     copy_block9(full, src, 16, stride, 9);\
1671     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1672 }\
1673 \
1674 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1675     uint8_t full[16*9];\
1676     uint8_t half[64];\
1677     copy_block9(full, src, 16, stride, 9);\
1678     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1679     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1680 }\
1681 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1682     uint8_t full[16*9];\
1683     uint8_t halfH[72];\
1684     uint8_t halfV[64];\
1685     uint8_t halfHV[64];\
1686     copy_block9(full, src, 16, stride, 9);\
1687     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1688     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1689     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1690     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1691 }\
1692 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1693     uint8_t full[16*9];\
1694     uint8_t halfH[72];\
1695     uint8_t halfHV[64];\
1696     copy_block9(full, src, 16, stride, 9);\
1697     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1698     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1699     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1700     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1701 }\
1702 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1703     uint8_t full[16*9];\
1704     uint8_t halfH[72];\
1705     uint8_t halfV[64];\
1706     uint8_t halfHV[64];\
1707     copy_block9(full, src, 16, stride, 9);\
1708     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1709     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1710     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1711     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1712 }\
1713 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1714     uint8_t full[16*9];\
1715     uint8_t halfH[72];\
1716     uint8_t halfHV[64];\
1717     copy_block9(full, src, 16, stride, 9);\
1718     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1719     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1720     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1721     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1722 }\
1723 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724     uint8_t full[16*9];\
1725     uint8_t halfH[72];\
1726     uint8_t halfV[64];\
1727     uint8_t halfHV[64];\
1728     copy_block9(full, src, 16, stride, 9);\
1729     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733 }\
1734 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1735     uint8_t full[16*9];\
1736     uint8_t halfH[72];\
1737     uint8_t halfHV[64];\
1738     copy_block9(full, src, 16, stride, 9);\
1739     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1743 }\
1744 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745     uint8_t full[16*9];\
1746     uint8_t halfH[72];\
1747     uint8_t halfV[64];\
1748     uint8_t halfHV[64];\
1749     copy_block9(full, src, 16, stride, 9);\
1750     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1751     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754 }\
1755 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1756     uint8_t full[16*9];\
1757     uint8_t halfH[72];\
1758     uint8_t halfHV[64];\
1759     copy_block9(full, src, 16, stride, 9);\
1760     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1764 }\
1765 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1766     uint8_t halfH[72];\
1767     uint8_t halfHV[64];\
1768     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1769     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1770     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1771 }\
1772 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1773     uint8_t halfH[72];\
1774     uint8_t halfHV[64];\
1775     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1778 }\
1779 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[16*9];\
1781     uint8_t halfH[72];\
1782     uint8_t halfV[64];\
1783     uint8_t halfHV[64];\
1784     copy_block9(full, src, 16, stride, 9);\
1785     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1787     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1789 }\
1790 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1791     uint8_t full[16*9];\
1792     uint8_t halfH[72];\
1793     copy_block9(full, src, 16, stride, 9);\
1794     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1796     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1797 }\
1798 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1799     uint8_t full[16*9];\
1800     uint8_t halfH[72];\
1801     uint8_t halfV[64];\
1802     uint8_t halfHV[64];\
1803     copy_block9(full, src, 16, stride, 9);\
1804     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1805     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1806     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1807     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1808 }\
1809 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1810     uint8_t full[16*9];\
1811     uint8_t halfH[72];\
1812     copy_block9(full, src, 16, stride, 9);\
1813     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1814     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1815     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1816 }\
1817 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1818     uint8_t halfH[72];\
1819     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1820     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1821 }\
1822 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1823     OPNAME ## pixels16_c(dst, src, stride, 16);\
1824 }\
1825 \
1826 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1827     uint8_t half[256];\
1828     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1829     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1830 }\
1831 \
1832 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1833     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1834 }\
1835 \
1836 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1837     uint8_t half[256];\
1838     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1839     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1840 }\
1841 \
1842 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1843     uint8_t full[24*17];\
1844     uint8_t half[256];\
1845     copy_block17(full, src, 24, stride, 17);\
1846     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1847     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1848 }\
1849 \
1850 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1851     uint8_t full[24*17];\
1852     copy_block17(full, src, 24, stride, 17);\
1853     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1854 }\
1855 \
1856 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1857     uint8_t full[24*17];\
1858     uint8_t half[256];\
1859     copy_block17(full, src, 24, stride, 17);\
1860     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1861     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1862 }\
1863 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1864     uint8_t full[24*17];\
1865     uint8_t halfH[272];\
1866     uint8_t halfV[256];\
1867     uint8_t halfHV[256];\
1868     copy_block17(full, src, 24, stride, 17);\
1869     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1870     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1871     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1872     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1873 }\
1874 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1875     uint8_t full[24*17];\
1876     uint8_t halfH[272];\
1877     uint8_t halfHV[256];\
1878     copy_block17(full, src, 24, stride, 17);\
1879     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1880     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1881     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1882     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1883 }\
1884 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1885     uint8_t full[24*17];\
1886     uint8_t halfH[272];\
1887     uint8_t halfV[256];\
1888     uint8_t halfHV[256];\
1889     copy_block17(full, src, 24, stride, 17);\
1890     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1891     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1892     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1893     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1894 }\
1895 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1896     uint8_t full[24*17];\
1897     uint8_t halfH[272];\
1898     uint8_t halfHV[256];\
1899     copy_block17(full, src, 24, stride, 17);\
1900     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1901     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1902     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1903     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1904 }\
1905 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906     uint8_t full[24*17];\
1907     uint8_t halfH[272];\
1908     uint8_t halfV[256];\
1909     uint8_t halfHV[256];\
1910     copy_block17(full, src, 24, stride, 17);\
1911     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915 }\
1916 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1917     uint8_t full[24*17];\
1918     uint8_t halfH[272];\
1919     uint8_t halfHV[256];\
1920     copy_block17(full, src, 24, stride, 17);\
1921     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1925 }\
1926 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927     uint8_t full[24*17];\
1928     uint8_t halfH[272];\
1929     uint8_t halfV[256];\
1930     uint8_t halfHV[256];\
1931     copy_block17(full, src, 24, stride, 17);\
1932     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1933     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936 }\
1937 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t full[24*17];\
1939     uint8_t halfH[272];\
1940     uint8_t halfHV[256];\
1941     copy_block17(full, src, 24, stride, 17);\
1942     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1946 }\
1947 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1948     uint8_t halfH[272];\
1949     uint8_t halfHV[256];\
1950     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1951     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1952     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1953 }\
1954 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1955     uint8_t halfH[272];\
1956     uint8_t halfHV[256];\
1957     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1958     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1960 }\
1961 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1962     uint8_t full[24*17];\
1963     uint8_t halfH[272];\
1964     uint8_t halfV[256];\
1965     uint8_t halfHV[256];\
1966     copy_block17(full, src, 24, stride, 17);\
1967     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1969     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1971 }\
1972 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1973     uint8_t full[24*17];\
1974     uint8_t halfH[272];\
1975     copy_block17(full, src, 24, stride, 17);\
1976     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1978     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1979 }\
1980 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981     uint8_t full[24*17];\
1982     uint8_t halfH[272];\
1983     uint8_t halfV[256];\
1984     uint8_t halfHV[256];\
1985     copy_block17(full, src, 24, stride, 17);\
1986     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1988     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1990 }\
1991 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1992     uint8_t full[24*17];\
1993     uint8_t halfH[272];\
1994     copy_block17(full, src, 24, stride, 17);\
1995     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1997     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1998 }\
1999 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2000     uint8_t halfH[272];\
2001     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2002     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2003 }
2004
/* Store operators handed to QPEL_MC as its last argument.
 *   a - destination lvalue (it is assigned to, and read by the avg forms)
 *   b - raw filter sum produced by the lowpass kernels
 * ">>5" divides by 32, which equals the sum of the kernel taps
 * (2*(20-6+3-1)). Adding 16 first rounds to nearest; the no_rnd forms add 15
 * instead. The avg forms additionally average the result with the value
 * already stored in a (with +1 rounding in op_avg, without it in
 * op_avg_no_rnd).
 * All four index `cm`, so the expanding function must have a local named cm
 * in scope; the kernels set it to cropTbl + MAX_NEG_CROP (presumably a
 * clamping table - confirm against where cropTbl is filled). */
2005 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2006 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2007 #define op_put(a, b) a = cm[((b) + 16)>>5]
2008 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2009
/* Instantiate the qpel function families: put_, put_no_rnd_ and avg_.
 * The third argument is pasted between "put" and the helper names inside the
 * macro (the RND parameter), selecting the rounding flavour of the helpers. */
2010 QPEL_MC(0, put_       , _       , op_put)
2011 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2012 QPEL_MC(0, avg_       , _       , op_avg)
/* NOTE(review): the avg_no_rnd instantiation below is disabled, so
 * op_avg_no_rnd is defined above but used by no active instantiation here. */
2013 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meant for the instantiations above. */
2014 #undef op_avg
2015 #undef op_avg_no_rnd
2016 #undef op_put
2017 #undef op_put_no_rnd
2018
2019 #if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2) emits the static lowpass kernels for block
 * sizes 4, 8 and 16 under the name prefix OPNAME.
 *
 * Every kernel evaluates the same six-sample sum
 *     (c+d)*20 - (b+e)*5 + (a+f)
 * over six consecutive samples a..f, horizontally (h), vertically (v) or
 * horizontally and then vertically (hv).
 *  - h and v kernels hand the raw sum to OP(dst_pixel, sum).
 *  - hv kernels first store raw horizontal sums for size+5 rows (starting two
 *    rows above the block) in the int16_t scratch plane tmp, then run the same
 *    filter vertically over tmp and hand that sum to OP2(dst_pixel, sum).
 * OP and OP2 are supplied by the instantiation; scaling and clipping are
 * presumably done there (not visible in this macro). The local `cm` is kept
 * in every kernel because OP/OP2 may index it.
 *
 * h and hv kernels read two samples left and three right of each output
 * row; v and hv kernels read two rows above and three below the block.
 * The 16-wide kernels are composed of four 8-wide calls.
 * All loops have compile-time trip counts, so a compiler can unroll them.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride)\
{\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, x;\
\
    for(row=0; row<4; row++){\
        for(x=0; x<4; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride)\
{\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col, k;\
\
    for(col=0; col<4; col++){\
        int tap[9];\
        /* tap[0] is two rows above the block; load the column before storing */\
        for(k=0; k<9; k++){\
            tap[k] = src[(k-2)*srcStride];\
        }\
        for(k=0; k<4; k++){\
            OP(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride)\
{\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, col, k;\
\
    /* pass 1: raw horizontal sums for 4+5 rows, starting two rows above */\
    src -= 2*srcStride;\
    for(row=0; row<4+5; row++){\
        for(k=0; k<4; k++){\
            tmp[k] = (src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*5 + (src[k-2]+src[k+3]);\
        }\
        tmp += tmpStride;\
        src += srcStride;\
    }\
    /* rewind tmp to the stored row that lines up with output row 0 */\
    tmp -= tmpStride*(4+5-2);\
    /* pass 2: vertical filter over the scratch plane */\
    for(col=0; col<4; col++){\
        int tap[9];\
        for(k=0; k<9; k++){\
            tap[k] = tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<4; k++){\
            OP2(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride)\
{\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, x;\
\
    for(row=0; row<8; row++){\
        for(x=0; x<8; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride)\
{\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col, k;\
\
    for(col=0; col<8; col++){\
        int tap[13];\
        /* tap[0] is two rows above the block; load the column before storing */\
        for(k=0; k<13; k++){\
            tap[k] = src[(k-2)*srcStride];\
        }\
        for(k=0; k<8; k++){\
            OP(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride)\
{\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, col, k;\
\
    /* pass 1: raw horizontal sums for 8+5 rows, starting two rows above */\
    src -= 2*srcStride;\
    for(row=0; row<8+5; row++){\
        for(k=0; k<8; k++){\
            tmp[k] = (src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*5 + (src[k-2]+src[k+3]);\
        }\
        tmp += tmpStride;\
        src += srcStride;\
    }\
    /* rewind tmp to the stored row that lines up with output row 0 */\
    tmp -= tmpStride*(8+5-2);\
    /* pass 2: vertical filter over the scratch plane */\
    for(col=0; col<8; col++){\
        int tap[13];\
        for(k=0; k<13; k++){\
            tap[k] = tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<8; k++){\
            OP2(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride)\
{\
    /* four 8x8 quadrants: top-left, top-right, bottom-left, bottom-right */\
    OPNAME ## h264_qpel8_v_lowpass(dst,                   src,                   dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst + 8,               src + 8,               dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst + 8*dstStride,     src + 8*srcStride,     dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst + 8*dstStride + 8, src + 8*srcStride + 8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride)\
{\
    /* four 8x8 quadrants: top-left, top-right, bottom-left, bottom-right */\
    OPNAME ## h264_qpel8_h_lowpass(dst,                   src,                   dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst + 8,               src + 8,               dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst + 8*dstStride,     src + 8*srcStride,     dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst + 8*dstStride + 8, src + 8*srcStride + 8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride)\
{\
    /* tmp is not advanced for the lower quadrants: the scratch rows are reused */\
    OPNAME ## h264_qpel8_hv_lowpass(dst,                   tmp,     src,                   dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst + 8,               tmp + 8, src + 8,               dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst + 8*dstStride,     tmp,     src + 8*srcStride,     dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst + 8*dstStride + 8, tmp + 8, src + 8*srcStride + 8, dstStride, tmpStride, srcStride);\
}\
2221
/**
 * Generates the sixteen motion compensation entry points
 * OPNAME h264_qpel SIZE _mcXY_c for a SIZE x SIZE block.
 * X selects the horizontal and Y the vertical sub-sample position (0..3):
 * position 2 is produced by the lowpass filter alone, positions 1 and 3
 * combine two planes through the OPNAME pixels SIZE _l2 helper.
 * Intermediate planes are always produced with the put_ variants so that
 * the OPNAME operation is applied exactly once, on the final store.
 * The vertical filter needs 2 rows above and 3 rows below the block, hence
 * the SIZE*(SIZE+5) scratch buffers.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* full-sample position: no filtering */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, filt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* horizontal half-sample position: filter straight into dst */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, filt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2; /* row 0 of the block inside col_buf */\
    uint8_t filt[SIZE*SIZE];\
    copy_block ## SIZE (col_buf, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filt, col_src, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, col_src, filt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    copy_block ## SIZE (col_buf, src - 2*stride, SIZE,  stride, SIZE + 5);\
    /* vertical half-sample position: filter straight into dst */\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, col_src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    uint8_t filt[SIZE*SIZE];\
    copy_block ## SIZE (col_buf, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filt, col_src, SIZE, SIZE);\
    /* col_src+SIZE is the copied block shifted down by one row */\
    OPNAME ## pixels ## SIZE ## _l2(dst, col_src+SIZE, filt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    copy_block ## SIZE (col_buf, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, col_src, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    /* vertical plane is taken one column to the right */\
    copy_block ## SIZE (col_buf, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, col_src, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    /* horizontal plane is taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src + stride, SIZE, stride);\
    copy_block ## SIZE (col_buf, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, col_src, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src + stride, SIZE, stride);\
    copy_block ## SIZE (col_buf, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, col_src, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t hv_tmp[SIZE*(SIZE+5)]; /* unrounded output of the first filter pass */\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, hv_tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t hv_tmp[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, hv_tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t hv_tmp[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, hv_tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    int16_t hv_tmp[SIZE*(SIZE+5)];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    copy_block ## SIZE (col_buf, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, col_src, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, hv_tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtV, filtHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t col_buf[SIZE*(SIZE+5)];\
    uint8_t * const col_src= col_buf + SIZE*2;\
    int16_t hv_tmp[SIZE*(SIZE+5)];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    copy_block ## SIZE (col_buf, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, col_src, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, hv_tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtV, filtHV, stride, SIZE, SIZE, SIZE);\
}
2358
/*
 * Store operations plugged into H264_LOWPASS.
 * op_*  : round and scale a single-pass filter sum (+16, >>5).
 * op2_* : round and scale a two-pass (hv) filter sum (+512, >>10).
 * The put variants overwrite the destination; the avg variants take the
 * rounded mean of the previous destination value and the new sample.
 * All of them index a table named cm, which must be in scope at the
 * expansion site (its definition is not part of this section).
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* instantiate the lowpass filters, then the mcXY entry points for 4x4, 8x8 and 16x16 */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* the store operations are only meaningful for the instantiations above */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2378 #endif
2379
/*
 * Weighted prediction helpers.
 *
 * op_scale1(x): scale block[x] in place by a single weight.
 * op_scale2(x): blend src[x] and dst[x] with two weights into dst[x].
 * Both rely on the variables of the function they are expanded in and
 * clamp the result with clip_uint8().
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generates weight_h264_pixels WxH _c and biweight_h264_pixels WxH _c.
 *
 * weight:   block = clip((block*weight + (offset<<log2_denom) + round) >> log2_denom),
 *           where round is 1<<(log2_denom-1) when log2_denom is non-zero.
 * biweight: dst = clip((src*weights + dst*weightd + offset') >> (log2_denom+1)),
 *           where offset' = ((((offsets+offsetd+1)>>1) << 1) + 1) << log2_denom.
 *
 * W is a compile-time constant (2, 4, 8 or 16), so the "if(W==n) continue"
 * tests only select how much of the unrolled row is executed.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); /* rounding term */ \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int y; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2450
2451 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2452     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2453     int i;
2454
2455     for(i=0; i<h; i++){
2456         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2457         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2458         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2459         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2460         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2461         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2462         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2463         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2464         dst+=dstStride;
2465         src+=srcStride;
2466     }
2467 }
2468
2469 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2470     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2471     int i;
2472
2473     for(i=0; i<w; i++){
2474         const int src_1= src[ -srcStride];
2475         const int src0 = src[0          ];
2476         const int src1 = src[  srcStride];
2477         const int src2 = src[2*srcStride];
2478         const int src3 = src[3*srcStride];
2479         const int src4 = src[4*srcStride];
2480         const int src5 = src[5*srcStride];
2481         const int src6 = src[6*srcStride];
2482         const int src7 = src[7*srcStride];
2483         const int src8 = src[8*srcStride];
2484         const int src9 = src[9*srcStride];
2485         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2486         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2487         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2488         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2489         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2490         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2491         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2492         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2493         src++;
2494         dst++;
2495     }
2496 }
2497
/** Full-sample position: forwards to put_pixels8_c for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2501
/**
 * Horizontally filtered plane combined with the unfiltered block at src
 * through put_pixels8_l2 (defined elsewhere, presumably a rounded average).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8]; /* packed 8x8 plane, stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2507
/** Horizontal lowpass of an 8x8 block written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2511
/**
 * Horizontally filtered plane combined with the unfiltered block one pixel
 * to the right (src+1) through put_pixels8_l2 (defined elsewhere).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8]; /* packed 8x8 plane, stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, filtered, stride, stride, 8, 8);
}
2517
/** Vertical lowpass of an 8x8 block written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2521
/**
 * Combines the vertically filtered plane with the plane filtered in both
 * directions. The horizontal pass covers 11 rows starting one row above the
 * block, because the following vertical pass reads rows -1 .. 9.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11]; /* horizontal pass, rows -1 .. 9 */
    uint8_t planeV[8*8];
    uint8_t planeHV[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(planeV, src, 8, stride, 8);
    /* rowsH+8 skips the extra top row, i.e. it is row 0 of the block */
    wmv2_mspel8_v_lowpass(planeHV, rowsH + 8, 8, 8, 8);
    put_pixels8_l2(dst, planeV, planeHV, stride, 8, 8, 8);
}
/**
 * Same as put_mspel8_mc12_c, except that the purely vertical plane is
 * computed one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11]; /* horizontal pass, rows -1 .. 9 */
    uint8_t planeV[8*8];
    uint8_t planeHV[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(planeV, src + 1, 8, stride, 8);
    /* rowsH+8 skips the extra top row, i.e. it is row 0 of the block */
    wmv2_mspel8_v_lowpass(planeHV, rowsH + 8, 8, 8, 8);
    put_pixels8_l2(dst, planeV, planeHV, stride, 8, 8, 8);
}
/**
 * Horizontal pass over 11 rows (one above, two below the block) followed
 * by a vertical pass that writes the 8x8 result directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11]; /* horizontal pass, rows -1 .. 9 */

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowsH + 8, stride, 8, 8);
}
2545
2546 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2547     int x;
2548     const int strength= ff_h263_loop_filter_strength[qscale];
2549
2550     for(x=0; x<8; x++){
2551         int d1, d2, ad1;
2552         int p0= src[x-2*stride];
2553         int p1= src[x-1*stride];
2554         int p2= src[x+0*stride];
2555         int p3= src[x+1*stride];
2556         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2557
2558         if     (d<-2*strength) d1= 0;
2559         else if(d<-  strength) d1=-2*strength - d;
2560         else if(d<   strength) d1= d;
2561         else if(d< 2*strength) d1= 2*strength - d;
2562         else                   d1= 0;
2563
2564         p1 += d1;
2565         p2 -= d1;
2566         if(p1&256) p1= ~(p1>>31);
2567         if(p2&256) p2= ~(p2>>31);
2568
2569         src[x-1*stride] = p1;
2570         src[x+0*stride] = p2;
2571
2572         ad1= ABS(d1)>>1;
2573
2574         d2= clip((p0-p3)/4, -ad1, ad1);
2575
2576         src[x-2*stride] = p0 - d2;
2577         src[x+  stride] = p3 + d2;
2578     }
2579 }
2580
2581 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2582     int y;
2583     const int strength= ff_h263_loop_filter_strength[qscale];
2584
2585     for(y=0; y<8; y++){
2586         int d1, d2, ad1;
2587         int p0= src[y*stride-2];
2588         int p1= src[y*stride-1];
2589         int p2= src[y*stride+0];
2590         int p3= src[y*stride+1];
2591         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2592
2593         if     (d<-2*strength) d1= 0;
2594         else if(d<-  strength) d1=-2*strength - d;
2595         else if(d<   strength) d1= d;
2596         else if(d< 2*strength) d1= 2*strength - d;
2597         else                   d1= 0;
2598
2599         p1 += d1;
2600         p2 -= d1;
2601         if(p1&256) p1= ~(p1>>31);
2602         if(p2&256) p2= ~(p2>>31);
2603
2604         src[y*stride-1] = p1;
2605         src[y*stride+0] = p2;
2606
2607         ad1= ABS(d1)>>1;
2608
2609         d2= clip((p0-p3)/4, -ad1, ad1);
2610
2611         src[y*stride-2] = p0 - d2;
2612         src[y*stride+1] = p3 + d2;
2613     }
2614 }
2615
/**
 * Separable (1, 2, 1)/4 smoothing of an 8x8 block, done in place.
 * The outermost rows and columns are not filtered across the block border:
 * the first and last row skip the vertical pass, the first and last column
 * skip the horizontal pass.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64]; /* result of the vertical pass, scaled by 4 */
    int row, col;

    /* vertical pass: reads the whole block before anything is written back */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row*stride;
        int *out = vert + row*8;

        for (col = 0; col < 8; col++) {
            if (row == 0 || row == 7)
                out[col] = 4*line[col];
            else
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass with final rounding */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row*stride;
        const int *in = vert + row*8;

        line[0] = (in[0] + 2)>>2;
        line[7] = (in[7] + 2)>>2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8)>>4;
    }
}
2642
/**
 * Sum of absolute differences between two 16 pixel wide blocks of h rows.
 * Both blocks use line_size as their row stride; v is not used.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2670
/**
 * SAD of a 16 pixel wide block against the reference interpolated between
 * each pixel and its right neighbour (avg2). Reads pix2[0..16] per row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2698
/**
 * SAD of a 16 pixel wide block against the reference interpolated between
 * each pixel and the one directly below it (avg2). Reads h+1 rows of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2728
/**
 * SAD of a 16 pixel wide block against the reference interpolated from the
 * 2x2 neighbourhood of each pixel (avg4). Reads 17 columns and h+1 rows
 * of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2758
/**
 * Sum of absolute differences between two 8 pixel wide blocks of h rows.
 * Both blocks use line_size as their row stride; v is not used.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2778
/**
 * SAD of an 8 pixel wide block against the reference interpolated between
 * each pixel and its right neighbour (avg2). Reads pix2[0..8] per row.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2798
/**
 * SAD of an 8 pixel wide block against the reference interpolated between
 * each pixel and the one directly below it (avg2). Reads h+1 rows of pix2.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2820
/**
 * SAD of an 8 pixel wide block against the reference interpolated from the
 * 2x2 neighbourhood of each pixel (avg4). Reads 9 columns and h+1 rows
 * of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2842
2843 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2844     int score1=0;
2845     int score2=0;
2846     int x,y;
2847
2848     for(y=0; y<h; y++){
2849         for(x=0; x<16; x++){
2850             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2851         }
2852         if(y+1<h){
2853             for(x=0; x<15; x++){
2854                 score2+= ABS(  s1[x  ] - s1[x  +stride]
2855                              - s1[x+1] + s1[x+1+stride])
2856                         -ABS(  s2[x  ] - s2[x  +stride]
2857                              - s2[x+1] + s2[x+1+stride]);
2858             }
2859         }
2860         s1+= stride;
2861         s2+= stride;
2862     }
2863
2864     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2865     else  return score1 + ABS(score2)*8;
2866 }
2867
2868 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2869     int score1=0;
2870     int score2=0;
2871     int x,y;
2872
2873     for(y=0; y<h; y++){
2874         for(x=0; x<8; x++){
2875             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2876         }
2877         if(y+1<h){
2878             for(x=0; x<7; x++){
2879                 score2+= ABS(  s1[x  ] - s1[x  +stride]
2880                              - s1[x+1] + s1[x+1+stride])
2881                         -ABS(  s2[x  ] - s2[x  +stride]
2882                              - s2[x+1] + s2[x+1+stride]);
2883             }
2884         }
2885         s1+= stride;
2886         s2+= stride;
2887     }
2888
2889     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2890     else  return score1 + ABS(score2)*8;
2891 }
2892
2893 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2894     int i;
2895     unsigned int sum=0;
2896
2897     for(i=0; i<8*8; i++){
2898         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2899         int w= weight[i];
2900         b>>= RECON_SHIFT;
2901         assert(-512<b && b<512);
2902
2903         sum += (w*b)*(w*b)>>4;
2904     }
2905     return sum>>2;
2906 }
2907
2908 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2909     int i;
2910
2911     for(i=0; i<8*8; i++){
2912         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2913     }
2914 }
2915
2916 /**
2917  * permutes an 8x8 block.
2918  * @param block the block which will be permuted according to the given permutation vector
2919  * @param permutation the permutation vector
2920  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2921  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2922  *                  (inverse) permutated to scantable order!
2923  */
2924 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2925 {
2926     int i;
2927     DCTELEM temp[64];
2928
2929     if(last<=0) return;
2930     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2931
2932     for(i=0; i<=last; i++){
2933         const int j= scantable[i];
2934         temp[j]= block[j];
2935         block[j]=0;
2936     }
2937
2938     for(i=0; i<=last; i++){
2939         const int j= scantable[i];
2940         const int perm_j= permutation[j];
2941         block[perm_j]= temp[j];
2942     }
2943 }
2944
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int score = 0;

    return score;
}
2948
2949 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2950     int i;
2951
2952     memset(cmp, 0, sizeof(void*)*5);
2953
2954     for(i=0; i<5; i++){
2955         switch(type&0xFF){
2956         case FF_CMP_SAD:
2957             cmp[i]= c->sad[i];
2958             break;
2959         case FF_CMP_SATD:
2960             cmp[i]= c->hadamard8_diff[i];
2961             break;
2962         case FF_CMP_SSE:
2963             cmp[i]= c->sse[i];
2964             break;
2965         case FF_CMP_DCT:
2966             cmp[i]= c->dct_sad[i];
2967             break;
2968         case FF_CMP_DCTMAX:
2969             cmp[i]= c->dct_max[i];
2970             break;
2971         case FF_CMP_PSNR:
2972             cmp[i]= c->quant_psnr[i];
2973             break;
2974         case FF_CMP_BIT:
2975             cmp[i]= c->bit[i];
2976             break;
2977         case FF_CMP_RD:
2978             cmp[i]= c->rd[i];
2979             break;
2980         case FF_CMP_VSAD:
2981             cmp[i]= c->vsad[i];
2982             break;
2983         case FF_CMP_VSSE:
2984             cmp[i]= c->vsse[i];
2985             break;
2986         case FF_CMP_ZERO:
2987             cmp[i]= zero_cmp;
2988             break;
2989         case FF_CMP_NSSE:
2990             cmp[i]= c->nsse[i];
2991             break;
2992         case FF_CMP_W53:
2993             cmp[i]= c->w53[i];
2994             break;
2995         case FF_CMP_W97:
2996             cmp[i]= c->w97[i];
2997             break;
2998         default:
2999             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3000         }
3001     }
3002 }
3003
3004 /**
3005  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3006  */
3007 static void clear_blocks_c(DCTELEM *blocks)
3008 {
3009     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3010 }
3011
/**
 * Add src to dst element by element: dst[i] += src[i] for 0 <= i < w.
 * The sum is stored back into a uint8_t, so it wraps modulo 256.
 * Nothing is touched when w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int remaining;

    for(remaining= w; remaining > 0; remaining--){
        *dst += *src;
        dst++;
        src++;
    }
}
3027
/**
 * Element-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * The result is stored into a uint8_t, so it wraps modulo 256.
 * Nothing is written when w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int remaining;

    for(remaining= w; remaining > 0; remaining--){
        *dst = *src1 - *src2;
        dst++;
        src1++;
        src2++;
    }
}
3043
/**
 * Write the prediction residual of src2 into dst.
 *
 * For each position the predictor is mid_pred() of the left sample, the
 * src1 sample at the same position, and (left + src1 sample - left_top)
 * masked to 8 bits; dst receives the src2 sample minus that predictor.
 *
 * @param left      in: initial left sample, out: last src2 sample consumed
 * @param left_top  in: initial top-left sample, out: last src1 sample consumed
 *
 * Both carried values are held as uint8_t, so the values read from and
 * written back through left/left_top are truncated to 8 bits.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left    = *left;
    uint8_t cur_left_top= *left_top;
    int x;

    for(x=0; x<w; x++){
        const uint8_t top  = src1[x];
        const int gradient = (cur_left + top - cur_left_top)&0xFF;
        const int pred     = mid_pred(cur_left, top, gradient);

        /* slide the neighbourhood one sample to the right */
        cur_left_top= top;
        cur_left    = src2[x];
        dst[x]      = cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_left_top;
}
3061
/* Butterfly helpers for the Hadamard transforms below.
 *
 * BUTTERFLY2 and BUTTERFLY1 expand to a single do{}while(0) statement, so
 * they behave like one statement under an unbraced if/for, and every input
 * expression is evaluated exactly once. The temporaries carry a macro-specific
 * suffix so that arguments named a/b (or anything else) are never captured.
 */

/* o1 = i1 + i2, o2 = i1 - i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    const int butterfly2_in1_= (i1);\
    const int butterfly2_in2_= (i2);\
    (o1)= butterfly2_in1_ + butterfly2_in2_;\
    (o2)= butterfly2_in1_ - butterfly2_in2_;\
}while(0)

/* in-place butterfly: x becomes x+y, y becomes x-y (using the old x and y) */
#define BUTTERFLY1(x,y) \
do{\
    const int butterfly1_x_= (x);\
    const int butterfly1_y_= (y);\
    (x)= butterfly1_x_ + butterfly1_y_;\
    (y)= butterfly1_x_ - butterfly1_y_;\
}while(0)

/* |x+y| + |x-y|: final butterfly stage folded with the absolute-value sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3076
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 *
 * Each row of the difference is transformed with three butterfly stages,
 * then each column; the last column stage is folded into the absolute sum
 * through BUTTERFLYA.
 *
 * @param s      unused
 * @param stride byte distance between rows in both dst and src
 * @param h      must be 8 (asserted)
 * @return accumulated absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int row, col, k;

    assert(h==8);

    /* horizontal pass: transform every row of the difference block */
    for(row=0; row<8; row++){
        int * const t= temp + 8*row;
        const uint8_t * const ps= src + stride*row;
        const uint8_t * const pd= dst + stride*row;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], ps[k]-pd[k], ps[k+1]-pd[k+1]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(t[k  ], t[k+2]);
            BUTTERFLY1(t[k+1], t[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(t[k], t[k+4]);
        }
    }

    /* vertical pass: transform every column and accumulate magnitudes */
    for(col=0; col<8; col++){
        int * const v= temp + col;

        for(k=0; k<8; k+=2){
            BUTTERFLY1(v[8*k], v[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(v[8*k    ], v[8*(k+2)]);
            BUTTERFLY1(v[8*(k+1)], v[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(v[8*k], v[8*(k+4)]);
        }
    }

    return sum;
}
3128
/**
 * Sum of absolute values of the 8x8 Hadamard transform of src itself,
 * with the magnitude of one term, ABS(temp[0] + temp[32]), subtracted at
 * the end (marked "-mean" in the code).
 *
 * @param s      unused
 * @param dummy  unused
 * @param stride byte distance between rows of src
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int row, col, k;

    assert(h==8);

    /* horizontal pass over the raw pixels */
    for(row=0; row<8; row++){
        int * const t= temp + 8*row;
        const uint8_t * const ps= src + stride*row;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], ps[k], ps[k+1]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(t[k  ], t[k+2]);
            BUTTERFLY1(t[k+1], t[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(t[k], t[k+4]);
        }
    }

    /* vertical pass, last stage merged into the absolute sum */
    for(col=0; col<8; col++){
        int * const v= temp + col;

        for(k=0; k<8; k+=2){
            BUTTERFLY1(v[8*k], v[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(v[8*k    ], v[8*(k+2)]);
            BUTTERFLY1(v[8*(k+1)], v[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(v[8*k], v[8*(k+4)]);
        }
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3176
3177 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3178     MpegEncContext * const s= (MpegEncContext *)c;
3179     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3180     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3181     int sum=0, i;
3182
3183     assert(h==8);
3184
3185     s->dsp.diff_pixels(temp, src1, src2, stride);
3186     s->dsp.fdct(temp);
3187
3188     for(i=0; i<64; i++)
3189         sum+= ABS(temp[i]);
3190
3191     return sum;
3192 }
3193
3194 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3195     MpegEncContext * const s= (MpegEncContext *)c;
3196     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3197     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3198     int sum=0, i;
3199
3200     assert(h==8);
3201
3202     s->dsp.diff_pixels(temp, src1, src2, stride);
3203     s->dsp.fdct(temp);
3204
3205     for(i=0; i<64; i++)
3206         sum= FFMAX(sum, ABS(temp[i]));
3207
3208     return sum;
3209 }
3210
3211 void simple_idct(DCTELEM *block); //FIXME
3212
3213 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3214     MpegEncContext * const s= (MpegEncContext *)c;
3215     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3216     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3217     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3218     int sum=0, i;
3219
3220     assert(h==8);
3221     s->mb_intra=0;
3222
3223     s->dsp.diff_pixels(temp, src1, src2, stride);
3224
3225     memcpy(bak, temp, 64*sizeof(DCTELEM));
3226
3227     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3228     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3229     simple_idct(temp); //FIXME
3230
3231     for(i=0; i<64; i++)
3232         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3233
3234     return sum;
3235 }
3236
3237 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3238     MpegEncContext * const s= (MpegEncContext *)c;
3239     const uint8_t *scantable= s->intra_scantable.permutated;
3240     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3241     uint64_t __align8 aligned_bak[stride];
3242     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3243     uint8_t * const bak= (uint8_t*)aligned_bak;
3244     int i, last, run, bits, level, distoration, start_i;
3245     const int esc_length= s->ac_esc_length;
3246     uint8_t * length;
3247     uint8_t * last_length;
3248
3249     assert(h==8);
3250
3251     for(i=0; i<8; i++){
3252         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3253         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3254     }
3255
3256     s->dsp.diff_pixels(temp, src1, src2, stride);
3257
3258     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3259
3260     bits=0;
3261
3262     if (s->mb_intra) {
3263         start_i = 1;
3264         length     = s->intra_ac_vlc_length;
3265         last_length= s->intra_ac_vlc_last_length;
3266         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3267     } else {
3268         start_i = 0;
3269         length     = s->inter_ac_vlc_length;
3270         last_length= s->inter_ac_vlc_last_length;
3271     }
3272
3273     if(last>=start_i){
3274         run=0;
3275         for(i=start_i; i<last; i++){
3276             int j= scantable[i];
3277             level= temp[j];
3278
3279             if(level){
3280                 level+=64;
3281                 if((level&(~127)) == 0){
3282                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3283                 }else
3284                     bits+= esc_length;
3285                 run=0;
3286             }else
3287                 run++;
3288         }
3289         i= scantable[last];
3290
3291         level= temp[i] + 64;
3292
3293         assert(level - 64);
3294
3295         if((level&(~127)) == 0){
3296             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3297         }else
3298             bits+= esc_length;
3299
3300     }
3301
3302     if(last>=0){
3303         if(s->mb_intra)
3304             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3305         else
3306             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3307     }
3308
3309     s->dsp.idct_add(bak, stride, temp);
3310
3311     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3312
3313     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3314 }
3315
3316 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3317     MpegEncContext * const s= (MpegEncContext *)c;
3318     const uint8_t *scantable= s->intra_scantable.permutated;
3319     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3320     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3321     int i, last, run, bits, level, start_i;
3322     const int esc_length= s->ac_esc_length;
3323     uint8_t * length;
3324     uint8_t * last_length;
3325
3326     assert(h==8);
3327
3328     s->dsp.diff_pixels(temp, src1, src2, stride);
3329
3330     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3331
3332     bits=0;
3333
3334     if (s->mb_intra) {
3335         start_i = 1;
3336         length     = s->intra_ac_vlc_length;
3337         last_length= s->intra_ac_vlc_last_length;
3338         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3339     } else {
3340         start_i = 0;
3341         length     = s->inter_ac_vlc_length;
3342         last_length= s->inter_ac_vlc_last_length;
3343     }
3344
3345     if(last>=start_i){
3346         run=0;
3347         for(i=start_i; i<last; i++){
3348             int j= scantable[i];
3349             level= temp[j];
3350
3351             if(level){
3352                 level+=64;
3353                 if((level&(~127)) == 0){
3354                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3355                 }else
3356                     bits+= esc_length;
3357                 run=0;
3358             }else
3359                 run++;
3360         }
3361         i= scantable[last];
3362
3363         level= temp[i] + 64;
3364
3365         assert(level - 64);
3366
3367         if((level&(~127)) == 0){
3368             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3369         }else
3370             bits+= esc_length;
3371     }
3372
3373     return bits;
3374 }
3375
/**
 * Vertical SAD of a 16-pixel-wide block: sum over rows 1..h-1 of the
 * absolute difference between each pixel and the pixel one row earlier.
 *
 * @param c      unused
 * @param dummy  unused
 * @param stride byte distance between rows of s
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper= s;
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const lower= upper + stride;

        for(col=0; col<16; col++){
            score+= ABS(upper[col] - lower[col]);
        }
        upper= lower;
    }

    return score;
}
3390
/**
 * Vertical SAD of the difference s1 - s2 over a 16-pixel-wide block: for
 * each pair of adjacent rows, the absolute change of (s1 - s2) between the
 * two rows is accumulated.
 *
 * @param c      unused
 * @param stride byte distance between rows, shared by s1 and s2
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *upper1= s1;
    const uint8_t *upper2= s2;
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const lower1= upper1 + stride;
        const uint8_t * const lower2= upper2 + stride;

        for(col=0; col<16; col++){
            score+= ABS(upper1[col] - upper2[col] - lower1[col] + lower2[col]);
        }
        upper1= lower1;
        upper2= lower2;
    }

    return score;
}
3405
#define SQ(a) ((a)*(a))
/**
 * Vertical SSE of a 16-pixel-wide block: sum over rows 1..h-1 of the
 * squared difference between each pixel and the pixel one row earlier.
 *
 * @param c      unused
 * @param dummy  unused
 * @param stride byte distance between rows of s
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper= s;
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const lower= upper + stride;

        for(col=0; col<16; col++){
            score+= SQ(upper[col] - lower[col]);
        }
        upper= lower;
    }

    return score;
}
3421
/**
 * Vertical SSE of the difference s1 - s2 over a 16-pixel-wide block: for
 * each pair of adjacent rows, the squared change of (s1 - s2) between the
 * two rows is accumulated.
 *
 * @param c      unused
 * @param stride byte distance between rows, shared by s1 and s2
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *upper1= s1;
    const uint8_t *upper2= s2;
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const lower1= upper1 + stride;
        const uint8_t * const lower2= upper2 + stride;

        for(col=0; col<16; col++){
            score+= SQ(upper1[col] - upper2[col] - lower1[col] + lower2[col]);
        }
        upper1= lower1;
        upper2= lower2;
    }

    return score;
}
3436
/* Instantiate the *16_c comparison functions from their 8x8 counterparts.
 * WARPER8_16_SQ is defined outside this part of the file; presumably it
 * emits a function named by the second argument that applies the 8x8
 * function (first argument) to the sub-blocks of a 16-wide block and sums
 * the results -- confirm against the macro definition.
 * The generated names are the ones SET_CMP_FUNC() in dsputil_init() stores
 * in slot [0]; hadamard8_intra16_c is stored in hadamard8_diff[4]. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3444
3445 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3446  converted */
3447 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3448 {
3449     j_rev_dct (block);
3450     put_pixels_clamped_c(block, dest, line_size);
3451 }
3452 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3453 {
3454     j_rev_dct (block);
3455     add_pixels_clamped_c(block, dest, line_size);
3456 }
3457
3458 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3459 {
3460     j_rev_dct4 (block);
3461     put_pixels_clamped4_c(block, dest, line_size);
3462 }
3463 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3464 {
3465     j_rev_dct4 (block);
3466     add_pixels_clamped4_c(block, dest, line_size);
3467 }
3468
3469 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3470 {
3471     j_rev_dct2 (block);
3472     put_pixels_clamped2_c(block, dest, line_size);
3473 }
3474 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3475 {
3476     j_rev_dct2 (block);
3477     add_pixels_clamped2_c(block, dest, line_size);
3478 }
3479
3480 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3481 {
3482     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3483
3484     dest[0] = cm[(block[0] + 4)>>3];
3485 }
3486 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3487 {
3488     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3489
3490     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3491 }
3492
3493 /* init static data */
3494 void dsputil_static_init(void)
3495 {
3496     int i;
3497
3498     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3499     for(i=0;i<MAX_NEG_CROP;i++) {
3500         cropTbl[i] = 0;
3501         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3502     }
3503
3504     for(i=0;i<512;i++) {
3505         squareTbl[i] = (i - 256) * (i - 256);
3506     }
3507
3508     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3509 }
3510
3511
/**
 * Populate a DSPContext with the C reference implementations, give the
 * architecture-specific init hooks (when compiled in) the chance to run on
 * the same context, and finally build c->idct_permutation.
 *
 * @param c     context to fill
 * @param avctx codec context; dct_algo, idct_algo and lowres are read here,
 *              and it is passed on to the architecture hooks
 *
 * Order matters: the C pointers are installed first, the hooks run after
 * them, and the permutation table is derived last from whatever
 * c->idct_permutation_type holds at that point.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT choice, only compiled when encoders are enabled */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT choice: lowres 1/2/3 pick the j_rev_dct4/2/1 variants,
       anything else picks a full-size IDCT based on idct_algo */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= simple_idct_put;
            c->idct_add= simple_idct_add;
            c->idct    = simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    c->h264_idct_add= ff_h264_idct_add_c;

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct = vp3_idct_c;

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* fills one row of a *_pixels_tab with the plain, _x2, _y2 and _xy2
   variants of the NUM-wide function */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: only indices 0-2, 4-6 and 8-10 are assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* redefined for the 16-entry tables: entry index = 4*Y + X for _mcXY */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* H.264 weighting functions, one entry per block size from 16x16
       down to 2x2 */
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

/* stores the 16-wide variant in slot [0] and the 8x8 variant in slot [1];
   expands to two statements, so it is invoked without a trailing ';' */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    /* comparison function tables; these are the tables ff_set_cmp()
       copies from */
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* architecture-specific hooks: each one receives the context after all
       C defaults above have been installed (presumably to replace entries
       with optimized versions -- see the respective implementations) */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
   dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table; idct_permutation_type is
       read only now, i.e. after the hooks above have run */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* keeps the row bits (0x38) and rotates the three column bits */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swaps the row and column index of each coefficient */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        /* only reported; idct_permutation is not written in this case */
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3797