git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "mpegvideo.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "h263.h"
  36 #include "snow.h"
  37
  38 /* snow.c */
  39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  40
  41 /* vorbis.c */
  42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  43
  44 /* flacenc.c */
  45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  46
  47 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  48 uint32_t ff_squareTbl[512] = {0, };
  49
  50 const uint8_t ff_zigzag_direct[64] = {
  51     0,   1,  8, 16,  9,  2,  3, 10,
  52     17, 24, 32, 25, 18, 11,  4,  5,
  53     12, 19, 26, 33, 40, 48, 41, 34,
  54     27, 20, 13,  6,  7, 14, 21, 28,
  55     35, 42, 49, 56, 57, 50, 43, 36,
  56     29, 22, 15, 23, 30, 37, 44, 51,
  57     58, 59, 52, 45, 38, 31, 39, 46,
  58     53, 60, 61, 54, 47, 55, 62, 63
  59 };
  60
  61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  62    specification, we interleave the fields */
  63 const uint8_t ff_zigzag248_direct[64] = {
  64      0,  8,  1,  9, 16, 24,  2, 10,
  65     17, 25, 32, 40, 48, 56, 33, 41,
  66     18, 26,  3, 11,  4, 12, 19, 27,
  67     34, 42, 49, 57, 50, 58, 35, 43,
  68     20, 28,  5, 13,  6, 14, 21, 29,
  69     36, 44, 51, 59, 52, 60, 37, 45,
  70     22, 30,  7, 15, 23, 31, 38, 46,
  71     53, 61, 54, 62, 39, 47, 55, 63,
  72 };
  73
  74 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  75 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  76
  77 const uint8_t ff_alternate_horizontal_scan[64] = {
  78     0,  1,   2,  3,  8,  9, 16, 17,
  79     10, 11,  4,  5,  6,  7, 15, 14,
  80     13, 12, 19, 18, 24, 25, 32, 33,
  81     26, 27, 20, 21, 22, 23, 28, 29,
  82     30, 31, 34, 35, 40, 41, 48, 49,
  83     42, 43, 36, 37, 38, 39, 44, 45,
  84     46, 47, 50, 51, 56, 57, 58, 59,
  85     52, 53, 54, 55, 60, 61, 62, 63,
  86 };
  87
  88 const uint8_t ff_alternate_vertical_scan[64] = {
  89     0,  8,  16, 24,  1,  9,  2, 10,
  90     17, 25, 32, 40, 48, 56, 57, 49,
  91     41, 33, 26, 18,  3, 11,  4, 12,
  92     19, 27, 34, 42, 50, 58, 35, 43,
  93     51, 59, 20, 28,  5, 13,  6, 14,
  94     21, 29, 36, 44, 52, 60, 37, 45,
  95     53, 61, 22, 30,  7, 15, 23, 31,
  96     38, 46, 54, 62, 39, 47, 55, 63,
  97 };
  98
  99 /* Reciprocal table for replacing a division by a multiplication:
      * a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
      * (the product has to be formed in 64-bit arithmetic, since the entries
      * themselves use up to 32 bits).
      * Entry 0 is 0 and entry 1 is 2^32-1, so the identity above does not
      * cover b < 2.
      * NOTE(review): the remaining entries look like ceil(2^32/b); only
      * spot-checked -- confirm before regenerating the table. */
 100 const uint32_t ff_inverse[256]={
 101          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 102  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 103  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 104  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 105  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 106  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 107   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 108   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 109   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 110   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 111   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 112   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 113   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 114   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 115   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 116   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 117   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 118   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 119   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 120   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 121   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 122   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 123   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 124   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 125   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 126   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 127   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 128   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 129   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 130   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 131   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 132   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 133 };
 134
 135 /* Input permutation for the simple_idct_mmx */
 136 static const uint8_t simple_mmx_permutation[64]={
 137         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 138         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 139         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 140         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 141         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 142         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 143         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 144         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 145 };
 146
 147 static int pix_sum_c(uint8_t * pix, int line_size)
 148 {
 149     int s, i, j;
 150
 151     s = 0;
 152     for (i = 0; i < 16; i++) {
 153         for (j = 0; j < 16; j += 8) {
 154             s += pix[0];
 155             s += pix[1];
 156             s += pix[2];
 157             s += pix[3];
 158             s += pix[4];
 159             s += pix[5];
 160             s += pix[6];
 161             s += pix[7];
 162             pix += 8;
 163         }
 164         pix += line_size - 16;
 165     }
 166     return s;
 167 }
 168
 169 static int pix_norm1_c(uint8_t * pix, int line_size)
 170 {
 171     int s, i, j;
 172     uint32_t *sq = ff_squareTbl + 256;
 173
 174     s = 0;
 175     for (i = 0; i < 16; i++) {
 176         for (j = 0; j < 16; j += 8) {
 177 #if 0
 178             s += sq[pix[0]];
 179             s += sq[pix[1]];
 180             s += sq[pix[2]];
 181             s += sq[pix[3]];
 182             s += sq[pix[4]];
 183             s += sq[pix[5]];
 184             s += sq[pix[6]];
 185             s += sq[pix[7]];
 186 #else
 187 #if LONG_MAX > 2147483647
 188             register uint64_t x=*(uint64_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             s += sq[(x>>32)&0xff];
 194             s += sq[(x>>40)&0xff];
 195             s += sq[(x>>48)&0xff];
 196             s += sq[(x>>56)&0xff];
 197 #else
 198             register uint32_t x=*(uint32_t*)pix;
 199             s += sq[x&0xff];
 200             s += sq[(x>>8)&0xff];
 201             s += sq[(x>>16)&0xff];
 202             s += sq[(x>>24)&0xff];
 203             x=*(uint32_t*)(pix+4);
 204             s += sq[x&0xff];
 205             s += sq[(x>>8)&0xff];
 206             s += sq[(x>>16)&0xff];
 207             s += sq[(x>>24)&0xff];
 208 #endif
 209 #endif
 210             pix += 8;
 211         }
 212         pix += line_size - 16;
 213     }
 214     return s;
 215 }
 216
 217 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 218     int i;
 219
 220     for(i=0; i+8<=w; i+=8){
 221         dst[i+0]= bswap_32(src[i+0]);
 222         dst[i+1]= bswap_32(src[i+1]);
 223         dst[i+2]= bswap_32(src[i+2]);
 224         dst[i+3]= bswap_32(src[i+3]);
 225         dst[i+4]= bswap_32(src[i+4]);
 226         dst[i+5]= bswap_32(src[i+5]);
 227         dst[i+6]= bswap_32(src[i+6]);
 228         dst[i+7]= bswap_32(src[i+7]);
 229     }
 230     for(;i<w; i++){
 231         dst[i+0]= bswap_32(src[i+0]);
 232     }
 233 }
 234
 235 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 236 {
 237     int s, i;
 238     uint32_t *sq = ff_squareTbl + 256;
 239
 240     s = 0;
 241     for (i = 0; i < h; i++) {
 242         s += sq[pix1[0] - pix2[0]];
 243         s += sq[pix1[1] - pix2[1]];
 244         s += sq[pix1[2] - pix2[2]];
 245         s += sq[pix1[3] - pix2[3]];
 246         pix1 += line_size;
 247         pix2 += line_size;
 248     }
 249     return s;
 250 }
 251
 252 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 253 {
 254     int s, i;
 255     uint32_t *sq = ff_squareTbl + 256;
 256
 257     s = 0;
 258     for (i = 0; i < h; i++) {
 259         s += sq[pix1[0] - pix2[0]];
 260         s += sq[pix1[1] - pix2[1]];
 261         s += sq[pix1[2] - pix2[2]];
 262         s += sq[pix1[3] - pix2[3]];
 263         s += sq[pix1[4] - pix2[4]];
 264         s += sq[pix1[5] - pix2[5]];
 265         s += sq[pix1[6] - pix2[6]];
 266         s += sq[pix1[7] - pix2[7]];
 267         pix1 += line_size;
 268         pix2 += line_size;
 269     }
 270     return s;
 271 }
 272
 273 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 274 {
 275     int s, i;
 276     uint32_t *sq = ff_squareTbl + 256;
 277
 278     s = 0;
 279     for (i = 0; i < h; i++) {
 280         s += sq[pix1[ 0] - pix2[ 0]];
 281         s += sq[pix1[ 1] - pix2[ 1]];
 282         s += sq[pix1[ 2] - pix2[ 2]];
 283         s += sq[pix1[ 3] - pix2[ 3]];
 284         s += sq[pix1[ 4] - pix2[ 4]];
 285         s += sq[pix1[ 5] - pix2[ 5]];
 286         s += sq[pix1[ 6] - pix2[ 6]];
 287         s += sq[pix1[ 7] - pix2[ 7]];
 288         s += sq[pix1[ 8] - pix2[ 8]];
 289         s += sq[pix1[ 9] - pix2[ 9]];
 290         s += sq[pix1[10] - pix2[10]];
 291         s += sq[pix1[11] - pix2[11]];
 292         s += sq[pix1[12] - pix2[12]];
 293         s += sq[pix1[13] - pix2[13]];
 294         s += sq[pix1[14] - pix2[14]];
 295         s += sq[pix1[15] - pix2[15]];
 296
 297         pix1 += line_size;
 298         pix2 += line_size;
 299     }
 300     return s;
 301 }
 302
 303
 304 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 305 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 306     int s, i, j;
 307     const int dec_count= w==8 ? 3 : 4;
 308     int tmp[32*32];
 309     int level, ori;
 310     static const int scale[2][2][4][4]={
 311       {
 312         {
 313             // 9/7 8x8 dec=3
 314             {268, 239, 239, 213},
 315             {  0, 224, 224, 152},
 316             {  0, 135, 135, 110},
 317         },{
 318             // 9/7 16x16 or 32x32 dec=4
 319             {344, 310, 310, 280},
 320             {  0, 320, 320, 228},
 321             {  0, 175, 175, 136},
 322             {  0, 129, 129, 102},
 323         }
 324       },{
 325         {
 326             // 5/3 8x8 dec=3
 327             {275, 245, 245, 218},
 328             {  0, 230, 230, 156},
 329             {  0, 138, 138, 113},
 330         },{
 331             // 5/3 16x16 or 32x32 dec=4
 332             {352, 317, 317, 286},
 333             {  0, 328, 328, 233},
 334             {  0, 180, 180, 140},
 335             {  0, 132, 132, 105},
 336         }
 337       }
 338     };
 339
 340     for (i = 0; i < h; i++) {
 341         for (j = 0; j < w; j+=4) {
 342             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 343             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 344             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 345             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 346         }
 347         pix1 += line_size;
 348         pix2 += line_size;
 349     }
 350
 351     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 352
 353     s=0;
 354     assert(w==h);
 355     for(level=0; level<dec_count; level++){
 356         for(ori= level ? 1 : 0; ori<4; ori++){
 357             int size= w>>(dec_count-level);
 358             int sx= (ori&1) ? size : 0;
 359             int stride= 32<<(dec_count-level);
 360             int sy= (ori&2) ? stride>>1 : 0;
 361
 362             for(i=0; i<size; i++){
 363                 for(j=0; j<size; j++){
 364                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 365                     s += FFABS(v);
 366                 }
 367             }
 368         }
 369     }
 370     assert(s>=0);
 371     return s>>9;
 372 }
 373
 374 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 375     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 376 }
 377
 378 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 379     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 380 }
 381
 382 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 384 }
 385
 386 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 388 }
 389
 390 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 391     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 392 }
 393
 394 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 395     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 396 }
 397 #endif
 398
 399 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 400 {
 401     int i;
 402
 403     /* read the pixels */
 404     for(i=0;i<8;i++) {
 405         block[0] = pixels[0];
 406         block[1] = pixels[1];
 407         block[2] = pixels[2];
 408         block[3] = pixels[3];
 409         block[4] = pixels[4];
 410         block[5] = pixels[5];
 411         block[6] = pixels[6];
 412         block[7] = pixels[7];
 413         pixels += line_size;
 414         block += 8;
 415     }
 416 }
 417
 418 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 419                           const uint8_t *s2, int stride){
 420     int i;
 421
 422     /* read the pixels */
 423     for(i=0;i<8;i++) {
 424         block[0] = s1[0] - s2[0];
 425         block[1] = s1[1] - s2[1];
 426         block[2] = s1[2] - s2[2];
 427         block[3] = s1[3] - s2[3];
 428         block[4] = s1[4] - s2[4];
 429         block[5] = s1[5] - s2[5];
 430         block[6] = s1[6] - s2[6];
 431         block[7] = s1[7] - s2[7];
 432         s1 += stride;
 433         s2 += stride;
 434         block += 8;
 435     }
 436 }
 437
 438
 439 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 440                                  int line_size)
 441 {
 442     int i;
 443     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 444
 445     /* read the pixels */
 446     for(i=0;i<8;i++) {
 447         pixels[0] = cm[block[0]];
 448         pixels[1] = cm[block[1]];
 449         pixels[2] = cm[block[2]];
 450         pixels[3] = cm[block[3]];
 451         pixels[4] = cm[block[4]];
 452         pixels[5] = cm[block[5]];
 453         pixels[6] = cm[block[6]];
 454         pixels[7] = cm[block[7]];
 455
 456         pixels += line_size;
 457         block += 8;
 458     }
 459 }
 460
 461 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 462                                  int line_size)
 463 {
 464     int i;
 465     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 466
 467     /* read the pixels */
 468     for(i=0;i<4;i++) {
 469         pixels[0] = cm[block[0]];
 470         pixels[1] = cm[block[1]];
 471         pixels[2] = cm[block[2]];
 472         pixels[3] = cm[block[3]];
 473
 474         pixels += line_size;
 475         block += 8;
 476     }
 477 }
 478
 479 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 480                                  int line_size)
 481 {
 482     int i;
 483     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 484
 485     /* read the pixels */
 486     for(i=0;i<2;i++) {
 487         pixels[0] = cm[block[0]];
 488         pixels[1] = cm[block[1]];
 489
 490         pixels += line_size;
 491         block += 8;
 492     }
 493 }
 494
 495 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 496                                         uint8_t *restrict pixels,
 497                                         int line_size)
 498 {
 499     int i, j;
 500
 501     for (i = 0; i < 8; i++) {
 502         for (j = 0; j < 8; j++) {
 503             if (*block < -128)
 504                 *pixels = 0;
 505             else if (*block > 127)
 506                 *pixels = 255;
 507             else
 508                 *pixels = (uint8_t)(*block + 128);
 509             block++;
 510             pixels++;
 511         }
 512         pixels += (line_size - 8);
 513     }
 514 }
 515
 516 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 517                           int line_size)
 518 {
 519     int i;
 520     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 521
 522     /* read the pixels */
 523     for(i=0;i<8;i++) {
 524         pixels[0] = cm[pixels[0] + block[0]];
 525         pixels[1] = cm[pixels[1] + block[1]];
 526         pixels[2] = cm[pixels[2] + block[2]];
 527         pixels[3] = cm[pixels[3] + block[3]];
 528         pixels[4] = cm[pixels[4] + block[4]];
 529         pixels[5] = cm[pixels[5] + block[5]];
 530         pixels[6] = cm[pixels[6] + block[6]];
 531         pixels[7] = cm[pixels[7] + block[7]];
 532         pixels += line_size;
 533         block += 8;
 534     }
 535 }
 536
 537 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 538                           int line_size)
 539 {
 540     int i;
 541     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 542
 543     /* read the pixels */
 544     for(i=0;i<4;i++) {
 545         pixels[0] = cm[pixels[0] + block[0]];
 546         pixels[1] = cm[pixels[1] + block[1]];
 547         pixels[2] = cm[pixels[2] + block[2]];
 548         pixels[3] = cm[pixels[3] + block[3]];
 549         pixels += line_size;
 550         block += 8;
 551     }
 552 }
 553
 554 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 555                           int line_size)
 556 {
 557     int i;
 558     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 559
 560     /* read the pixels */
 561     for(i=0;i<2;i++) {
 562         pixels[0] = cm[pixels[0] + block[0]];
 563         pixels[1] = cm[pixels[1] + block[1]];
 564         pixels += line_size;
 565         block += 8;
 566     }
 567 }
 568
 569 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 570 {
 571     int i;
 572     for(i=0;i<8;i++) {
 573         pixels[0] += block[0];
 574         pixels[1] += block[1];
 575         pixels[2] += block[2];
 576         pixels[3] += block[3];
 577         pixels[4] += block[4];
 578         pixels[5] += block[5];
 579         pixels[6] += block[6];
 580         pixels[7] += block[7];
 581         pixels += line_size;
 582         block += 8;
 583     }
 584 }
 585
 586 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 587 {
 588     int i;
 589     for(i=0;i<4;i++) {
 590         pixels[0] += block[0];
 591         pixels[1] += block[1];
 592         pixels[2] += block[2];
 593         pixels[3] += block[3];
 594         pixels += line_size;
 595         block += 4;
 596     }
 597 }
 598
 599 static int sum_abs_dctelem_c(DCTELEM *block)
 600 {
 601     int sum=0, i;
 602     for(i=0; i<64; i++)
 603         sum+= FFABS(block[i]);
 604     return sum;
 605 }
 606
 607 #if 0
 608
     /* PIXOP2(OPNAME, OP), uint64_t variant.  It sits in the "#if 0" branch, so
      * it is never compiled; the PIXOP2 defined in the "#else" branch is the one
      * that takes effect.
      *
      * Each expansion defines a family of static functions that process one
      * 64-bit word (8 pixels) per row for h rows and combine the computed value
      * with the destination through OP(dst, value):
      *   OPNAME_pixels                        plain copy
      *   OPNAME_[no_rnd_]pixels_x2_c          byte-wise average with the word at pixels+1
      *   OPNAME_[no_rnd_]pixels_y2_c          byte-wise average with the word one row below
      *   OPNAME_[no_rnd_]pixels_xy2_c         byte-wise average of the four neighbours
      * followed by CALL_2X_PIXELS invocations (macro defined elsewhere) that
      * presumably build the 16-pixel-wide versions from them.
      *
      * NOTE(review): the first function is named OPNAME_pixels, while the
      * CALL_2X_PIXELS line refers to OPNAME_pixels_c; this looks like a mismatch
      * that would have to be resolved before enabling this branch -- confirm.
      *
      * op_avg(a, b), defined right after the macro, replaces a with
      * (a|b) - (((a^b) & 0xFE..FE) >> 1) on 64-bit words. */
 609 #define PIXOP2(OPNAME, OP) \
 610 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 611 {\
 612     int i;\
 613     for(i=0; i<h; i++){\
 614         OP(*((uint64_t*)block), AV_RN64(pixels));\
 615         pixels+=line_size;\
 616         block +=line_size;\
 617     }\
 618 }\
 619 \
 620 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 621 {\
 622     int i;\
 623     for(i=0; i<h; i++){\
 624         const uint64_t a= AV_RN64(pixels  );\
 625         const uint64_t b= AV_RN64(pixels+1);\
 626         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 627         pixels+=line_size;\
 628         block +=line_size;\
 629     }\
 630 }\
 631 \
 632 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 633 {\
 634     int i;\
 635     for(i=0; i<h; i++){\
 636         const uint64_t a= AV_RN64(pixels  );\
 637         const uint64_t b= AV_RN64(pixels+1);\
 638         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 639         pixels+=line_size;\
 640         block +=line_size;\
 641     }\
 642 }\
 643 \
 644 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 645 {\
 646     int i;\
 647     for(i=0; i<h; i++){\
 648         const uint64_t a= AV_RN64(pixels          );\
 649         const uint64_t b= AV_RN64(pixels+line_size);\
 650         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 651         pixels+=line_size;\
 652         block +=line_size;\
 653     }\
 654 }\
 655 \
 656 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 657 {\
 658     int i;\
 659     for(i=0; i<h; i++){\
 660         const uint64_t a= AV_RN64(pixels          );\
 661         const uint64_t b= AV_RN64(pixels+line_size);\
 662         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 663         pixels+=line_size;\
 664         block +=line_size;\
 665     }\
 666 }\
 667 \
 668 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 669 {\
 670         int i;\
 671         const uint64_t a= AV_RN64(pixels  );\
 672         const uint64_t b= AV_RN64(pixels+1);\
 673         uint64_t l0=  (a&0x0303030303030303ULL)\
 674                     + (b&0x0303030303030303ULL)\
 675                     + 0x0202020202020202ULL;\
 676         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 677                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 678         uint64_t l1,h1;\
 679 \
 680         pixels+=line_size;\
 681         for(i=0; i<h; i+=2){\
 682             uint64_t a= AV_RN64(pixels  );\
 683             uint64_t b= AV_RN64(pixels+1);\
 684             l1=  (a&0x0303030303030303ULL)\
 685                + (b&0x0303030303030303ULL);\
 686             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 687               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 688             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 689             pixels+=line_size;\
 690             block +=line_size;\
 691             a= AV_RN64(pixels  );\
 692             b= AV_RN64(pixels+1);\
 693             l0=  (a&0x0303030303030303ULL)\
 694                + (b&0x0303030303030303ULL)\
 695                + 0x0202020202020202ULL;\
 696             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 697               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 698             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 699             pixels+=line_size;\
 700             block +=line_size;\
 701         }\
 702 }\
 703 \
 704 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 705 {\
 706         int i;\
 707         const uint64_t a= AV_RN64(pixels  );\
 708         const uint64_t b= AV_RN64(pixels+1);\
 709         uint64_t l0=  (a&0x0303030303030303ULL)\
 710                     + (b&0x0303030303030303ULL)\
 711                     + 0x0101010101010101ULL;\
 712         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 713                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 714         uint64_t l1,h1;\
 715 \
 716         pixels+=line_size;\
 717         for(i=0; i<h; i+=2){\
 718             uint64_t a= AV_RN64(pixels  );\
 719             uint64_t b= AV_RN64(pixels+1);\
 720             l1=  (a&0x0303030303030303ULL)\
 721                + (b&0x0303030303030303ULL);\
 722             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 723               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 724             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 725             pixels+=line_size;\
 726             block +=line_size;\
 727             a= AV_RN64(pixels  );\
 728             b= AV_RN64(pixels+1);\
 729             l0=  (a&0x0303030303030303ULL)\
 730                + (b&0x0303030303030303ULL)\
 731                + 0x0101010101010101ULL;\
 732             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 733               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 734             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 735             pixels+=line_size;\
 736             block +=line_size;\
 737         }\
 738 }\
 739 \
 740 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 741 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 742 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 743 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 744 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 745 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 746 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 747
 748 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 749 #else // 64 bit variant
 750
 751 #define PIXOP2(OPNAME, OP) \
 752 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 753     int i;\
 754     for(i=0; i<h; i++){\
 755         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 756         pixels+=line_size;\
 757         block +=line_size;\
 758     }\
 759 }\
 760 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 761     int i;\
 762     for(i=0; i<h; i++){\
 763         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 764         pixels+=line_size;\
 765         block +=line_size;\
 766     }\
 767 }\
 768 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 769     int i;\
 770     for(i=0; i<h; i++){\
 771         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 772         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 773         pixels+=line_size;\
 774         block +=line_size;\
 775     }\
 776 }\
 777 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 778     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 779 }\
 780 \
 781 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 782                                                 int src_stride1, int src_stride2, int h){\
 783     int i;\
 784     for(i=0; i<h; i++){\
 785         uint32_t a,b;\
 786         a= AV_RN32(&src1[i*src_stride1  ]);\
 787         b= AV_RN32(&src2[i*src_stride2  ]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 789         a= AV_RN32(&src1[i*src_stride1+4]);\
 790         b= AV_RN32(&src2[i*src_stride2+4]);\
 791         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 792     }\
 793 }\
 794 \
 795 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 796                                                 int src_stride1, int src_stride2, int h){\
 797     int i;\
 798     for(i=0; i<h; i++){\
 799         uint32_t a,b;\
 800         a= AV_RN32(&src1[i*src_stride1  ]);\
 801         b= AV_RN32(&src2[i*src_stride2  ]);\
 802         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 803         a= AV_RN32(&src1[i*src_stride1+4]);\
 804         b= AV_RN32(&src2[i*src_stride2+4]);\
 805         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 806     }\
 807 }\
 808 \
 809 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 810                                                 int src_stride1, int src_stride2, int h){\
 811     int i;\
 812     for(i=0; i<h; i++){\
 813         uint32_t a,b;\
 814         a= AV_RN32(&src1[i*src_stride1  ]);\
 815         b= AV_RN32(&src2[i*src_stride2  ]);\
 816         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 817     }\
 818 }\
 819 \
 820 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 821                                                 int src_stride1, int src_stride2, int h){\
 822     int i;\
 823     for(i=0; i<h; i++){\
 824         uint32_t a,b;\
 825         a= AV_RN16(&src1[i*src_stride1  ]);\
 826         b= AV_RN16(&src2[i*src_stride2  ]);\
 827         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 828     }\
 829 }\
 830 \
 831 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 832                                                 int src_stride1, int src_stride2, int h){\
 833     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 834     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 835 }\
 836 \
 837 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 838                                                 int src_stride1, int src_stride2, int h){\
 839     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 840     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 841 }\
 842 \
 843 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 844     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 845 }\
 846 \
 847 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 848     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 849 }\
 850 \
 851 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 852     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 853 }\
 854 \
 855 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 856     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 857 }\
 858 \
 859 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 860                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 861     int i;\
 862     for(i=0; i<h; i++){\
 863         uint32_t a, b, c, d, l0, l1, h0, h1;\
 864         a= AV_RN32(&src1[i*src_stride1]);\
 865         b= AV_RN32(&src2[i*src_stride2]);\
 866         c= AV_RN32(&src3[i*src_stride3]);\
 867         d= AV_RN32(&src4[i*src_stride4]);\
 868         l0=  (a&0x03030303UL)\
 869            + (b&0x03030303UL)\
 870            + 0x02020202UL;\
 871         h0= ((a&0xFCFCFCFCUL)>>2)\
 872           + ((b&0xFCFCFCFCUL)>>2);\
 873         l1=  (c&0x03030303UL)\
 874            + (d&0x03030303UL);\
 875         h1= ((c&0xFCFCFCFCUL)>>2)\
 876           + ((d&0xFCFCFCFCUL)>>2);\
 877         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 878         a= AV_RN32(&src1[i*src_stride1+4]);\
 879         b= AV_RN32(&src2[i*src_stride2+4]);\
 880         c= AV_RN32(&src3[i*src_stride3+4]);\
 881         d= AV_RN32(&src4[i*src_stride4+4]);\
 882         l0=  (a&0x03030303UL)\
 883            + (b&0x03030303UL)\
 884            + 0x02020202UL;\
 885         h0= ((a&0xFCFCFCFCUL)>>2)\
 886           + ((b&0xFCFCFCFCUL)>>2);\
 887         l1=  (c&0x03030303UL)\
 888            + (d&0x03030303UL);\
 889         h1= ((c&0xFCFCFCFCUL)>>2)\
 890           + ((d&0xFCFCFCFCUL)>>2);\
 891         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 892     }\
 893 }\
 894 \
 895 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 896     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 897 }\
 898 \
 899 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 900     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 901 }\
 902 \
 903 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 904     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 905 }\
 906 \
 907 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 908     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 909 }\
 910 \
 911 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 912                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 913     int i;\
 914     for(i=0; i<h; i++){\
 915         uint32_t a, b, c, d, l0, l1, h0, h1;\
 916         a= AV_RN32(&src1[i*src_stride1]);\
 917         b= AV_RN32(&src2[i*src_stride2]);\
 918         c= AV_RN32(&src3[i*src_stride3]);\
 919         d= AV_RN32(&src4[i*src_stride4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x01010101UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930         a= AV_RN32(&src1[i*src_stride1+4]);\
 931         b= AV_RN32(&src2[i*src_stride2+4]);\
 932         c= AV_RN32(&src3[i*src_stride3+4]);\
 933         d= AV_RN32(&src4[i*src_stride4+4]);\
 934         l0=  (a&0x03030303UL)\
 935            + (b&0x03030303UL)\
 936            + 0x01010101UL;\
 937         h0= ((a&0xFCFCFCFCUL)>>2)\
 938           + ((b&0xFCFCFCFCUL)>>2);\
 939         l1=  (c&0x03030303UL)\
 940            + (d&0x03030303UL);\
 941         h1= ((c&0xFCFCFCFCUL)>>2)\
 942           + ((d&0xFCFCFCFCUL)>>2);\
 943         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 944     }\
 945 }\
 946 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 947                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 948     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 949     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 950 }\
 951 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 952                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 953     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 954     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 955 }\
 956 \
 957 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 958 {\
 959         int i, a0, b0, a1, b1;\
 960         a0= pixels[0];\
 961         b0= pixels[1] + 2;\
 962         a0 += b0;\
 963         b0 += pixels[2];\
 964 \
 965         pixels+=line_size;\
 966         for(i=0; i<h; i+=2){\
 967             a1= pixels[0];\
 968             b1= pixels[1];\
 969             a1 += b1;\
 970             b1 += pixels[2];\
 971 \
 972             block[0]= (a1+a0)>>2; /* FIXME non put */\
 973             block[1]= (b1+b0)>>2;\
 974 \
 975             pixels+=line_size;\
 976             block +=line_size;\
 977 \
 978             a0= pixels[0];\
 979             b0= pixels[1] + 2;\
 980             a0 += b0;\
 981             b0 += pixels[2];\
 982 \
 983             block[0]= (a1+a0)>>2;\
 984             block[1]= (b1+b0)>>2;\
 985             pixels+=line_size;\
 986             block +=line_size;\
 987         }\
 988 }\
 989 \
 990 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 991 {\
 992         int i;\
 993         const uint32_t a= AV_RN32(pixels  );\
 994         const uint32_t b= AV_RN32(pixels+1);\
 995         uint32_t l0=  (a&0x03030303UL)\
 996                     + (b&0x03030303UL)\
 997                     + 0x02020202UL;\
 998         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 999                    + ((b&0xFCFCFCFCUL)>>2);\
1000         uint32_t l1,h1;\
1001 \
1002         pixels+=line_size;\
1003         for(i=0; i<h; i+=2){\
1004             uint32_t a= AV_RN32(pixels  );\
1005             uint32_t b= AV_RN32(pixels+1);\
1006             l1=  (a&0x03030303UL)\
1007                + (b&0x03030303UL);\
1008             h1= ((a&0xFCFCFCFCUL)>>2)\
1009               + ((b&0xFCFCFCFCUL)>>2);\
1010             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011             pixels+=line_size;\
1012             block +=line_size;\
1013             a= AV_RN32(pixels  );\
1014             b= AV_RN32(pixels+1);\
1015             l0=  (a&0x03030303UL)\
1016                + (b&0x03030303UL)\
1017                + 0x02020202UL;\
1018             h0= ((a&0xFCFCFCFCUL)>>2)\
1019               + ((b&0xFCFCFCFCUL)>>2);\
1020             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1021             pixels+=line_size;\
1022             block +=line_size;\
1023         }\
1024 }\
1025 \
1026 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1027 {\
1028     int j;\
1029     for(j=0; j<2; j++){\
1030         int i;\
1031         const uint32_t a= AV_RN32(pixels  );\
1032         const uint32_t b= AV_RN32(pixels+1);\
1033         uint32_t l0=  (a&0x03030303UL)\
1034                     + (b&0x03030303UL)\
1035                     + 0x02020202UL;\
1036         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1037                    + ((b&0xFCFCFCFCUL)>>2);\
1038         uint32_t l1,h1;\
1039 \
1040         pixels+=line_size;\
1041         for(i=0; i<h; i+=2){\
1042             uint32_t a= AV_RN32(pixels  );\
1043             uint32_t b= AV_RN32(pixels+1);\
1044             l1=  (a&0x03030303UL)\
1045                + (b&0x03030303UL);\
1046             h1= ((a&0xFCFCFCFCUL)>>2)\
1047               + ((b&0xFCFCFCFCUL)>>2);\
1048             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049             pixels+=line_size;\
1050             block +=line_size;\
1051             a= AV_RN32(pixels  );\
1052             b= AV_RN32(pixels+1);\
1053             l0=  (a&0x03030303UL)\
1054                + (b&0x03030303UL)\
1055                + 0x02020202UL;\
1056             h0= ((a&0xFCFCFCFCUL)>>2)\
1057               + ((b&0xFCFCFCFCUL)>>2);\
1058             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1059             pixels+=line_size;\
1060             block +=line_size;\
1061         }\
1062         pixels+=4-line_size*(h+1);\
1063         block +=4-line_size*h;\
1064     }\
1065 }\
1066 \
1067 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1068 {\
1069     int j;\
1070     for(j=0; j<2; j++){\
1071         int i;\
1072         const uint32_t a= AV_RN32(pixels  );\
1073         const uint32_t b= AV_RN32(pixels+1);\
1074         uint32_t l0=  (a&0x03030303UL)\
1075                     + (b&0x03030303UL)\
1076                     + 0x01010101UL;\
1077         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1078                    + ((b&0xFCFCFCFCUL)>>2);\
1079         uint32_t l1,h1;\
1080 \
1081         pixels+=line_size;\
1082         for(i=0; i<h; i+=2){\
1083             uint32_t a= AV_RN32(pixels  );\
1084             uint32_t b= AV_RN32(pixels+1);\
1085             l1=  (a&0x03030303UL)\
1086                + (b&0x03030303UL);\
1087             h1= ((a&0xFCFCFCFCUL)>>2)\
1088               + ((b&0xFCFCFCFCUL)>>2);\
1089             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090             pixels+=line_size;\
1091             block +=line_size;\
1092             a= AV_RN32(pixels  );\
1093             b= AV_RN32(pixels+1);\
1094             l0=  (a&0x03030303UL)\
1095                + (b&0x03030303UL)\
1096                + 0x01010101UL;\
1097             h0= ((a&0xFCFCFCFCUL)>>2)\
1098               + ((b&0xFCFCFCFCUL)>>2);\
1099             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100             pixels+=line_size;\
1101             block +=line_size;\
1102         }\
1103         pixels+=4-line_size*(h+1);\
1104         block +=4-line_size*h;\
1105     }\
1106 }\
1107 \
1108 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1109 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1112 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1113 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1114 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1115 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1116
1117 #define op_avg(a, b) a = rnd_avg32(a, b)
1118 #endif
1119 #define op_put(a, b) a = b
1120
1121 PIXOP2(avg, op_avg)
1122 PIXOP2(put, op_put)
1123 #undef op_avg
1124 #undef op_put
1125
/*
 * Rounded averages of two and of four values: the sum plus half the
 * divisor, shifted right by 1 or 2.
 * Every argument is parenthesized so that arguments containing operators
 * of lower precedence than '+' (|, &, >>, ?:, ...) are grouped as the
 * caller wrote them. Arguments are evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1128
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination
 * and both sources are walked with the same line stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride,  /* dst_stride  */
                           stride,  /* src_stride1 */
                           stride,  /* src_stride2 */
                           h);
}
1132
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination
 * and both sources are walked with the same line stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride,  /* dst_stride  */
                          stride,  /* src_stride1 */
                          stride,  /* src_stride2 */
                          h);
}
1136
/**
 * Bilinear interpolation of an 8-pixel-wide block.
 *
 * Each output pixel blends the source pixel, its right neighbour and the
 * two pixels below them with weights derived from x16 and y16; the four
 * weights add up to 16*16 = 256, hence the final shift by 8.
 *
 * @param dst     destination, 8 pixels written per row
 * @param src     source; 9 pixels of rows 0..h are read
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder constant added before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right =       x16  * (16 - y16);
    const int w_bot_left  = (16 - x16) *       y16;
    const int w_bot_right =       x16  *       y16;
    int row;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left  * src[col]   + w_top_right * src[col + 1] +
                        w_bot_left  * below[col] + w_bot_right * below[col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1159
/**
 * Affine ("global") motion compensation of an 8-pixel-wide block.
 *
 * A source position is tracked per output pixel: it starts at (ox, oy),
 * moves by (dxx, dyx) for each pixel to the right and by (dxy, dyy) for
 * each row down. After dropping the low 16 bits, the position is a
 * fixed-point number with `shift` fractional bits. The integer part
 * selects the source pixel, the fraction the bilinear weights.
 *
 * A coordinate counts as inside when 0 <= coord < size-1, so that its +1
 * neighbour is still inside the picture. A coordinate outside that range
 * is clamped with av_clip() to [0, size-1] and no interpolation is done
 * along that axis.
 *
 * @param dst    destination, 8 pixels written per row
 * @param src    source picture
 * @param stride line size shared by dst and src
 * @param h      number of rows to produce
 * @param r      rounding constant added before the final shift
 * @param width  source width in pixels
 * @param height source height in pixels
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one       = 1 << shift;  /* weight corresponding to 1.0 */
    const int frac_mask = one - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int pos_x = ox;
        int pos_y = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int sx = pos_x >> 16;
            int sy = pos_y >> 16;
            const int fx = sx & frac_mask;
            const int fy = sy & frac_mask;
            int x_inside, y_inside, at;

            sx >>= shift;
            sy >>= shift;

            /* the unsigned compare folds "sx >= 0 && sx < max_x" into one test */
            x_inside = (unsigned)sx < (unsigned)max_x;
            y_inside = (unsigned)sy < (unsigned)max_y;

            if (x_inside && y_inside) {
                /* full bilinear blend of the 2x2 neighbourhood */
                at = sx + sy * stride;
                out[col] = (  (  src[at           ] * (one - fx)
                               + src[at        + 1] *        fx ) * (one - fy)
                            + (  src[at + stride    ] * (one - fx)
                               + src[at + stride + 1] *        fx ) *        fy
                            + r) >> (shift * 2);
            } else if (x_inside) {
                /* row clamped: horizontal blend only */
                at = sx + av_clip(sy, 0, max_y) * stride;
                out[col] = ( (  src[at    ] * (one - fx)
                              + src[at + 1] *        fx ) * one
                            + r) >> (shift * 2);
            } else if (y_inside) {
                /* column clamped: vertical blend only */
                at = av_clip(sx, 0, max_x) + sy * stride;
                out[col] = ( (  src[at         ] * (one - fy)
                              + src[at + stride] *        fy ) * one
                            + r) >> (shift * 2);
            } else {
                /* both clamped: nearest picture pixel, no blending */
                at = av_clip(sx, 0, max_x) + av_clip(sy, 0, max_y) * stride;
                out[col] = src[at];
            }

            pos_x += dxx;
            pos_y += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1217
/**
 * Third-pel "put" at offset (0,0): dispatches to the put_pixelsN_c routine
 * matching the block width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1226
/**
 * Third-pel "put", horizontal offset 1/3: each output pixel is
 * (2*cur + right + 1) scaled by 683/2048, i.e. approximately divided by 3.
 * Reads width+1 pixels from each source row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1237
/**
 * Third-pel "put", horizontal offset 2/3: each output pixel is
 * (cur + 2*right + 1) scaled by 683/2048, i.e. approximately divided by 3.
 * Reads width+1 pixels from each source row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1248
/**
 * Third-pel "put", vertical offset 1/3: each output pixel is
 * (2*cur + below + 1) scaled by 683/2048, i.e. approximately divided by 3.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + below[col] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1259
/**
 * Third-pel "put", offset (1/3, 1/3): weighted sum of the 2x2
 * neighbourhood with weights 4 (cur), 3 (right), 3 (below) and
 * 2 (below-right), plus 6, scaled by 2731/32768 (approximately 1/12).
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1270
/**
 * Third-pel "put", offset (1/3, 2/3): weighted sum of the 2x2
 * neighbourhood with weights 3 (cur), 2 (right), 4 (below) and
 * 3 (below-right), plus 6, scaled by 2731/32768 (approximately 1/12).
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1281
/**
 * Third-pel "put", vertical offset 2/3: each output pixel is
 * (cur + 2*below + 1) scaled by 683/2048, i.e. approximately divided by 3.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*below[col] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1292
/**
 * Third-pel "put", offset (2/3, 1/3): weighted sum of the 2x2
 * neighbourhood with weights 3 (cur), 4 (right), 2 (below) and
 * 3 (below-right), plus 6, scaled by 2731/32768 (approximately 1/12).
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1303
/**
 * Third-pel "put", offset (2/3, 2/3): weighted sum of the 2x2
 * neighbourhood with weights 2 (cur), 3 (right), 3 (below) and
 * 4 (below-right), plus 6, scaled by 2731/32768 (approximately 1/12).
 * Reads width+1 pixels from height+1 source rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1314
/**
 * Third-pel "avg" at offset (0,0): dispatches to the avg_pixelsN_c routine
 * matching the block width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1323
/**
 * Third-pel "avg", horizontal offset 1/3: computes the same prediction as
 * the put variant, (2*cur + right + 1) * 683 >> 11, then stores its
 * rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1334
/**
 * Third-pel "avg", horizontal offset 2/3: computes the same prediction as
 * the put variant, (cur + 2*right + 1) * 683 >> 11, then stores its
 * rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1345
/**
 * Third-pel "avg", vertical offset 1/3: computes the same prediction as
 * the put variant, (2*cur + below + 1) * 683 >> 11, then stores its
 * rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1356
/**
 * Third-pel "avg", offset (1/3, 1/3): computes the same prediction as the
 * put variant (2x2 weights 4, 3, 3, 2, plus 6, times 2731 >> 15), then
 * stores its rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1367
/**
 * Third-pel "avg", offset (1/3, 2/3): computes the same prediction as the
 * put variant (2x2 weights 3, 2, 4, 3, plus 6, times 2731 >> 15), then
 * stores its rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1378
/**
 * Third-pel "avg", vertical offset 2/3: computes the same prediction as
 * the put variant, (cur + 2*below + 1) * 683 >> 11, then stores its
 * rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1389
/**
 * Third-pel "avg", offset (2/3, 1/3): computes the same prediction as the
 * put variant (2x2 weights 3, 4, 2, 3, plus 6, times 2731 >> 15), then
 * stores its rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1400
/**
 * Third-pel "avg", offset (2/3, 2/3): computes the same prediction as the
 * put variant (2x2 weights 2, 3, 3, 4, plus 6, times 2731 >> 15), then
 * stores its rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
#if 0
/*
 * Compiled out. TPEL_WIDTH(width) would generate fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c() that forward to the generic
 * put_tpel_pixels_mcXY_c() routines with the width filled in.
 * The wrapper bodies are plain calls (a stray "void" in front of each
 * call used to turn them into invalid declarations).
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1432
/**
 * Generate one H.264 chroma motion-compensation function for blocks that
 * are W pixels wide: OPNAME ## h264_chroma_mc ## W ## _c.
 *
 * The generated function has the parameters
 *   dst    destination, W pixels written per row
 *   src    source block
 *   stride line size shared by dst and src
 *   h      number of rows
 *   x, y   eighth-pel offsets, each in 0..7 (asserted)
 *
 * A, B, C and D are the bilinear weights of the current pixel, its right
 * neighbour, the pixel below and the pixel below-right; they add up to 64.
 * The weighted sum is handed to OP, which scales it and stores it in dst.
 * Only neighbours with a non-zero weight are read, so with x == y == 0
 * nothing outside the W x h block is touched.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* fractional in both directions: blend all four neighbours */\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++)\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B+C){\
        /* fractional in one direction: blend with the right (step 1) or the lower (step stride) neighbour */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++)\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == y == 0: A is the only non-zero weight, no neighbour is read */\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++)\
                OP(dst[j], (A*src[j]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/** Generate the 2-, 4- and 8-pixel-wide chroma MC functions for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* b is a weighted sum with total weight 64: round it and scale it back by >>6 */
/* avg: rounded average of the scaled value with the pixel already in a */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
/* put: store the scaled value */
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1547
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding
 * constant: the weighted sum (total weight 64) gets 32 - 4 = 28 added
 * before the shift by 6, instead of 32.
 *
 * All four neighbours are read for every pixel regardless of their weight,
 * so 9 pixels of rows 0..h of src are accessed.
 *
 * @param dst    destination, 8 pixels written per row
 * @param src    source block
 * @param stride line size shared by dst and src
 * @param h      number of rows
 * @param x      horizontal eighth-pel offset, 0..7 (asserted)
 * @param y      vertical eighth-pel offset, 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_cur         = (8 - x) * (8 - y);
    const int w_right       =      x  * (8 - y);
    const int w_below       = (8 - x) *      y;
    const int w_below_right =      x  *      y;
    int row;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur   * src[col]   + w_right       * src[col + 1] +
                        w_below * below[col] + w_below_right * below[col + 1] +
                        32 - 4) >> 6;
        }
    }
}
1571
/**
 * Generate the C quarter-pel motion compensation functions for 8x8 and
 * 16x16 blocks.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name
 * @param RND    token pasted into the names of the intermediate helpers
 *               (put ## RND ## ...) that build the temporary planes
 * @param OP     store operator OP(dst, sum) applied to each raw filter sum;
 *               it is expanded where a local `cm` pointer into ff_cropTbl
 *               is in scope
 *
 * The *_lowpass functions apply the symmetric 8-tap kernel
 * (-1, 3, -6, 20, 20, -6, 3, -1) centred between two neighbouring samples.
 * They read 9 (qpel8) or 17 (qpel16) input samples per line; taps that would
 * fall outside that window are reflected back inside it, which is why the
 * first and last three outputs of each line use irregular index patterns.
 *
 * The qpelN_mcXY_c entry points combine the source block, the horizontally
 * filtered plane (halfH), the vertically filtered plane (halfV / half) and
 * the plane filtered in both directions (halfHV) through the pixelsN_l2 /
 * pixelsN_l4 helpers defined elsewhere in this file. The non-static
 * ff_..._old_c variants build all planes separately and merge them with
 * pixelsN_l4 (or pixelsN_l2 of halfV and halfHV).
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal lowpass: h rows of 8 outputs, each row reads src[0..8]. */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* Vertical lowpass over 8 columns; each column reads 9 rows of src. */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* Horizontal lowpass: h rows of 16 outputs, each row reads src[0..16]. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* Vertical lowpass over 16 columns; each column reads 17 rows of src. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* 8x8 entry points. `full` is a 9x9 copy of the source with a row pitch of 16; halfH holds 9 rows of 8. */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* 16x16 entry points. `full` is a 17x17 copy of the source with a row pitch of 24; halfH holds 17 rows of 16. */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2054
/*
 * Store operators passed to QPEL_MC as its OP argument.
 * `a` is the destination lvalue, `b` the raw filter sum. The sum is biased,
 * shifted right by 5 and used as an index into `cm`, which must be a local
 * of the function the operator is expanded in.
 *   op_put / op_avg:               bias 16 before the shift
 *   op_put_no_rnd / op_avg_no_rnd: bias 15 before the shift
 * The avg forms then average the looked-up value with the current
 * destination; op_avg adds 1 before halving, op_avg_no_rnd does not.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Arguments are (r, OPNAME, RND, OP). */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
/* Disabled avg_no_rnd instantiation; with it off, op_avg_no_rnd has no user here. */
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2068
2069 #if 1
2070 #define H264_LOWPASS(OPNAME, OP, OP2) \
2071 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2072     const int h=2;\
2073     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2074     int i;\
2075     for(i=0; i<h; i++)\
2076     {\
2077         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2078         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2079         dst+=dstStride;\
2080         src+=srcStride;\
2081     }\
2082 }\
2083 \
2084 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2085     const int w=2;\
2086     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2087     int i;\
2088     for(i=0; i<w; i++)\
2089     {\
2090         const int srcB= src[-2*srcStride];\
2091         const int srcA= src[-1*srcStride];\
2092         const int src0= src[0 *srcStride];\
2093         const int src1= src[1 *srcStride];\
2094         const int src2= src[2 *srcStride];\
2095         const int src3= src[3 *srcStride];\
2096         const int src4= src[4 *srcStride];\
2097         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2098         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2099         dst++;\
2100         src++;\
2101     }\
2102 }\
2103 \
2104 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2105     const int h=2;\
2106     const int w=2;\
2107     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2108     int i;\
2109     src -= 2*srcStride;\
2110     for(i=0; i<h+5; i++)\
2111     {\
2112         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2113         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2114         tmp+=tmpStride;\
2115         src+=srcStride;\
2116     }\
2117     tmp -= tmpStride*(h+5-2);\
2118     for(i=0; i<w; i++)\
2119     {\
2120         const int tmpB= tmp[-2*tmpStride];\
2121         const int tmpA= tmp[-1*tmpStride];\
2122         const int tmp0= tmp[0 *tmpStride];\
2123         const int tmp1= tmp[1 *tmpStride];\
2124         const int tmp2= tmp[2 *tmpStride];\
2125         const int tmp3= tmp[3 *tmpStride];\
2126         const int tmp4= tmp[4 *tmpStride];\
2127         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2128         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2129         dst++;\
2130         tmp++;\
2131     }\
2132 }\
2133 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2134     const int h=4;\
2135     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2136     int i;\
2137     for(i=0; i<h; i++)\
2138     {\
2139         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2140         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2141         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2142         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2143         dst+=dstStride;\
2144         src+=srcStride;\
2145     }\
2146 }\
2147 \
2148 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2149     const int w=4;\
2150     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2151     int i;\
2152     for(i=0; i<w; i++)\
2153     {\
2154         const int srcB= src[-2*srcStride];\
2155         const int srcA= src[-1*srcStride];\
2156         const int src0= src[0 *srcStride];\
2157         const int src1= src[1 *srcStride];\
2158         const int src2= src[2 *srcStride];\
2159         const int src3= src[3 *srcStride];\
2160         const int src4= src[4 *srcStride];\
2161         const int src5= src[5 *srcStride];\
2162         const int src6= src[6 *srcStride];\
2163         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2164         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2165         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2166         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2167         dst++;\
2168         src++;\
2169     }\
2170 }\
2171 \
2172 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2173     const int h=4;\
2174     const int w=4;\
2175     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2176     int i;\
2177     src -= 2*srcStride;\
2178     for(i=0; i<h+5; i++)\
2179     {\
2180         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2181         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2182         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2183         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2184         tmp+=tmpStride;\
2185         src+=srcStride;\
2186     }\
2187     tmp -= tmpStride*(h+5-2);\
2188     for(i=0; i<w; i++)\
2189     {\
2190         const int tmpB= tmp[-2*tmpStride];\
2191         const int tmpA= tmp[-1*tmpStride];\
2192         const int tmp0= tmp[0 *tmpStride];\
2193         const int tmp1= tmp[1 *tmpStride];\
2194         const int tmp2= tmp[2 *tmpStride];\
2195         const int tmp3= tmp[3 *tmpStride];\
2196         const int tmp4= tmp[4 *tmpStride];\
2197         const int tmp5= tmp[5 *tmpStride];\
2198         const int tmp6= tmp[6 *tmpStride];\
2199         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2200         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2201         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2202         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2203         dst++;\
2204         tmp++;\
2205     }\
2206 }\
2207 \
2208 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2209     const int h=8;\
2210     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2211     int i;\
2212     for(i=0; i<h; i++)\
2213     {\
2214         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2215         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2216         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2217         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2218         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2219         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2220         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2221         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2222         dst+=dstStride;\
2223         src+=srcStride;\
2224     }\
2225 }\
2226 \
2227 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2228     const int w=8;\
2229     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2230     int i;\
2231     for(i=0; i<w; i++)\
2232     {\
2233         const int srcB= src[-2*srcStride];\
2234         const int srcA= src[-1*srcStride];\
2235         const int src0= src[0 *srcStride];\
2236         const int src1= src[1 *srcStride];\
2237         const int src2= src[2 *srcStride];\
2238         const int src3= src[3 *srcStride];\
2239         const int src4= src[4 *srcStride];\
2240         const int src5= src[5 *srcStride];\
2241         const int src6= src[6 *srcStride];\
2242         const int src7= src[7 *srcStride];\
2243         const int src8= src[8 *srcStride];\
2244         const int src9= src[9 *srcStride];\
2245         const int src10=src[10*srcStride];\
2246         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2247         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2248         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2249         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2250         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2251         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2252         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2253         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2254         dst++;\
2255         src++;\
2256     }\
2257 }\
2258 \
2259 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2260     const int h=8;\
2261     const int w=8;\
2262     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2263     int i;\
2264     src -= 2*srcStride;\
2265     for(i=0; i<h+5; i++)\
2266     {\
2267         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2268         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2269         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2270         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2271         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2272         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2273         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2274         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2275         tmp+=tmpStride;\
2276         src+=srcStride;\
2277     }\
2278     tmp -= tmpStride*(h+5-2);\
2279     for(i=0; i<w; i++)\
2280     {\
2281         const int tmpB= tmp[-2*tmpStride];\
2282         const int tmpA= tmp[-1*tmpStride];\
2283         const int tmp0= tmp[0 *tmpStride];\
2284         const int tmp1= tmp[1 *tmpStride];\
2285         const int tmp2= tmp[2 *tmpStride];\
2286         const int tmp3= tmp[3 *tmpStride];\
2287         const int tmp4= tmp[4 *tmpStride];\
2288         const int tmp5= tmp[5 *tmpStride];\
2289         const int tmp6= tmp[6 *tmpStride];\
2290         const int tmp7= tmp[7 *tmpStride];\
2291         const int tmp8= tmp[8 *tmpStride];\
2292         const int tmp9= tmp[9 *tmpStride];\
2293         const int tmp10=tmp[10*tmpStride];\
2294         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2295         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2296         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2297         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2298         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2299         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2300         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2301         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2302         dst++;\
2303         tmp++;\
2304     }\
2305 }\
2306 \
2307 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2308     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2309     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2310     src += 8*srcStride;\
2311     dst += 8*dstStride;\
2312     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2313     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2314 }\
2315 \
2316 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2317     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2318     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2319     src += 8*srcStride;\
2320     dst += 8*dstStride;\
2321     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2322     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2323 }\
2324 \
2325 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2326     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2327     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2328     src += 8*srcStride;\
2329     dst += 8*dstStride;\
2330     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2331     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2332 }\
2333
/**
 * Generate the 16 quarter-pel motion compensation functions
 * OPNAME h264_qpel SIZE _mcXY_c for one block size.
 *
 * X is the horizontal and Y the vertical quarter-sample offset (0..3).
 * Half-sample planes come from the put_ lowpass filters; quarter-sample
 * positions are formed by combining two planes with
 * OPNAME pixels SIZE _l2, which also performs the final OPNAME store.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): integer position */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): source combined with the horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): source one sample to the right combined with the horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): source combined with the vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];            /* packed copy incl. 2 rows above, 3 below */\
    uint8_t * const rows_mid= &rows[2*SIZE]; /* first row of the block itself */\
    uint8_t vpel[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
/* (0,3): source one row down combined with the vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,1): horizontal half plane of this row + vertical half plane of this column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,1): horizontal half plane of this row + vertical half plane one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,3): horizontal half plane one row down + vertical half plane of this column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,3): horizontal half plane one row down + vertical half plane one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): centre position, 2-D lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid, src, stride, SIZE, stride);\
}\
\
/* (2,1): horizontal half plane of this row + centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, centre, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,3): horizontal half plane one row down + centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, centre, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,2): vertical half plane of this column + centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, centre, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,2): vertical half plane one column right + centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= &rows[2*SIZE];\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, centre, stride, SIZE, SIZE, SIZE);\
}
2470
2471 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2472 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2473 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2474 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2475 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2476
2477 H264_LOWPASS(put_       , op_put, op2_put)
2478 H264_LOWPASS(avg_       , op_avg, op2_avg)
2479 H264_MC(put_, 2)
2480 H264_MC(put_, 4)
2481 H264_MC(put_, 8)
2482 H264_MC(put_, 16)
2483 H264_MC(avg_, 4)
2484 H264_MC(avg_, 8)
2485 H264_MC(avg_, 16)
2486
2487 #undef op_avg
2488 #undef op_put
2489 #undef op2_avg
2490 #undef op2_put
2491 #endif
2492
2493 #define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2494 #define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2495 #define H264_WEIGHT(W,H) \
2496 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2497     int y; \
2498     offset <<= log2_denom; \
2499     if(log2_denom) offset += 1<<(log2_denom-1); \
2500     for(y=0; y<H; y++, block += stride){ \
2501         op_scale1(0); \
2502         op_scale1(1); \
2503         if(W==2) continue; \
2504         op_scale1(2); \
2505         op_scale1(3); \
2506         if(W==4) continue; \
2507         op_scale1(4); \
2508         op_scale1(5); \
2509         op_scale1(6); \
2510         op_scale1(7); \
2511         if(W==8) continue; \
2512         op_scale1(8); \
2513         op_scale1(9); \
2514         op_scale1(10); \
2515         op_scale1(11); \
2516         op_scale1(12); \
2517         op_scale1(13); \
2518         op_scale1(14); \
2519         op_scale1(15); \
2520     } \
2521 } \
2522 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2523     int y; \
2524     offset = ((offset + 1) | 1) << log2_denom; \
2525     for(y=0; y<H; y++, dst += stride, src += stride){ \
2526         op_scale2(0); \
2527         op_scale2(1); \
2528         if(W==2) continue; \
2529         op_scale2(2); \
2530         op_scale2(3); \
2531         if(W==4) continue; \
2532         op_scale2(4); \
2533         op_scale2(5); \
2534         op_scale2(6); \
2535         op_scale2(7); \
2536         if(W==8) continue; \
2537         op_scale2(8); \
2538         op_scale2(9); \
2539         op_scale2(10); \
2540         op_scale2(11); \
2541         op_scale2(12); \
2542         op_scale2(13); \
2543         op_scale2(14); \
2544         op_scale2(15); \
2545     } \
2546 }
2547
2548 H264_WEIGHT(16,16)
2549 H264_WEIGHT(16,8)
2550 H264_WEIGHT(8,16)
2551 H264_WEIGHT(8,8)
2552 H264_WEIGHT(8,4)
2553 H264_WEIGHT(4,8)
2554 H264_WEIGHT(4,4)
2555 H264_WEIGHT(4,2)
2556 H264_WEIGHT(2,4)
2557 H264_WEIGHT(2,2)
2558
2559 #undef op_scale1
2560 #undef op_scale2
2561 #undef H264_WEIGHT
2562
2563 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2564     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2565     int i;
2566
2567     for(i=0; i<h; i++){
2568         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2569         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2570         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2571         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2572         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2573         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2574         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2575         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2576         dst+=dstStride;
2577         src+=srcStride;
2578     }
2579 }
2580
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) CAVS entry points: thin wrappers around the generic
 * put/avg pixel routines with the block height filled in. */

/** Copy an 8x8 block. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int height = 8;
    put_pixels8_c(dst, src, stride, height);
}

/** Apply avg_pixels8_c to an 8x8 block. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int height = 8;
    avg_pixels8_c(dst, src, stride, height);
}

/** Copy a 16x16 block. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int height = 16;
    put_pixels16_c(dst, src, stride, height);
}

/** Apply avg_pixels16_c to a 16x16 block. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int height = 16;
    avg_pixels16_c(dst, src, stride, height);
}
#endif /* CONFIG_CAVS_DECODER */
2598
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/**
 * Full-pel (mc00) VC-1 entry point: copy an 8x8 block.
 * The rnd argument is accepted but not used by this position.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    const int height = 8;
    put_pixels8_c(dst, src, stride, height);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2607
2608 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2609
2610 /* H264 specific */
2611 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2612
2613 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2614     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2615     int i;
2616
2617     for(i=0; i<w; i++){
2618         const int src_1= src[ -srcStride];
2619         const int src0 = src[0          ];
2620         const int src1 = src[  srcStride];
2621         const int src2 = src[2*srcStride];
2622         const int src3 = src[3*srcStride];
2623         const int src4 = src[4*srcStride];
2624         const int src5 = src[5*srcStride];
2625         const int src6 = src[6*srcStride];
2626         const int src7 = src[7*srcStride];
2627         const int src8 = src[8*srcStride];
2628         const int src9 = src[9*srcStride];
2629         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2630         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2631         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2632         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2633         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2634         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2635         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2636         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2637         src++;
2638         dst++;
2639     }
2640 }
2641
2642 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2643     put_pixels8_c(dst, src, stride, 8);
2644 }
2645
2646 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2647     uint8_t half[64];
2648     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2649     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2650 }
2651
2652 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2653     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2654 }
2655
2656 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2657     uint8_t half[64];
2658     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2659     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2660 }
2661
2662 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2663     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2664 }
2665
2666 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2667     uint8_t halfH[88];
2668     uint8_t halfV[64];
2669     uint8_t halfHV[64];
2670     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2671     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2672     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2673     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2674 }
2675 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2676     uint8_t halfH[88];
2677     uint8_t halfV[64];
2678     uint8_t halfHV[64];
2679     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2680     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2681     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2682     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2683 }
2684 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2685     uint8_t halfH[88];
2686     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2687     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2688 }
2689
2690 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2691     if(ENABLE_ANY_H263) {
2692     int x;
2693     const int strength= ff_h263_loop_filter_strength[qscale];
2694
2695     for(x=0; x<8; x++){
2696         int d1, d2, ad1;
2697         int p0= src[x-2*stride];
2698         int p1= src[x-1*stride];
2699         int p2= src[x+0*stride];
2700         int p3= src[x+1*stride];
2701         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2702
2703         if     (d<-2*strength) d1= 0;
2704         else if(d<-  strength) d1=-2*strength - d;
2705         else if(d<   strength) d1= d;
2706         else if(d< 2*strength) d1= 2*strength - d;
2707         else                   d1= 0;
2708
2709         p1 += d1;
2710         p2 -= d1;
2711         if(p1&256) p1= ~(p1>>31);
2712         if(p2&256) p2= ~(p2>>31);
2713
2714         src[x-1*stride] = p1;
2715         src[x+0*stride] = p2;
2716
2717         ad1= FFABS(d1)>>1;
2718
2719         d2= av_clip((p0-p3)/4, -ad1, ad1);
2720
2721         src[x-2*stride] = p0 - d2;
2722         src[x+  stride] = p3 + d2;
2723     }
2724     }
2725 }
2726
2727 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2728     if(ENABLE_ANY_H263) {
2729     int y;
2730     const int strength= ff_h263_loop_filter_strength[qscale];
2731
2732     for(y=0; y<8; y++){
2733         int d1, d2, ad1;
2734         int p0= src[y*stride-2];
2735         int p1= src[y*stride-1];
2736         int p2= src[y*stride+0];
2737         int p3= src[y*stride+1];
2738         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2739
2740         if     (d<-2*strength) d1= 0;
2741         else if(d<-  strength) d1=-2*strength - d;
2742         else if(d<   strength) d1= d;
2743         else if(d< 2*strength) d1= 2*strength - d;
2744         else                   d1= 0;
2745
2746         p1 += d1;
2747         p2 -= d1;
2748         if(p1&256) p1= ~(p1>>31);
2749         if(p2&256) p2= ~(p2>>31);
2750
2751         src[y*stride-1] = p1;
2752         src[y*stride+0] = p2;
2753
2754         ad1= FFABS(d1)>>1;
2755
2756         d2= av_clip((p0-p3)/4, -ad1, ad1);
2757
2758         src[y*stride-2] = p0 - d2;
2759         src[y*stride+1] = p3 + d2;
2760     }
2761     }
2762 }
2763
/**
 * H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block in place.
 *
 * The vertical pass leaves the first and last row unfiltered (scaled by 4),
 * the horizontal pass leaves the first and last column unfiltered, so the
 * border samples are only filtered along the edge they lie on and the four
 * corners are unchanged.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];   /* vertical pass result, scaled by 4 */
    int row, col;

    /* vertical pass */
    for(col = 0; col < 8; col++){
        vert[0][col] = 4*src[col];
        vert[7][col] = 4*src[col + 7*stride];
    }
    for(row = 1; row < 7; row++){
        const uint8_t *line = src + row*stride;
        for(col = 0; col < 8; col++){
            vert[row][col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass, writing back with rounding */
    for(row = 0; row < 8; row++){
        uint8_t *line = src + row*stride;
        line[0] = (vert[row][0] + 2)>>2;
        line[7] = (vert[row][7] + 2)>>2;
        for(col = 1; col < 7; col++){
            line[col] = (vert[row][col-1] + 2*vert[row][col] + vert[row][col+1] + 8)>>4;
        }
    }
}
2790
2791 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2792 {
2793     int i, d;
2794     for( i = 0; i < 4; i++ ) {
2795         if( tc0[i] < 0 ) {
2796             pix += 4*ystride;
2797             continue;
2798         }
2799         for( d = 0; d < 4; d++ ) {
2800             const int p0 = pix[-1*xstride];
2801             const int p1 = pix[-2*xstride];
2802             const int p2 = pix[-3*xstride];
2803             const int q0 = pix[0];
2804             const int q1 = pix[1*xstride];
2805             const int q2 = pix[2*xstride];
2806
2807             if( FFABS( p0 - q0 ) < alpha &&
2808                 FFABS( p1 - p0 ) < beta &&
2809                 FFABS( q1 - q0 ) < beta ) {
2810
2811                 int tc = tc0[i];
2812                 int i_delta;
2813
2814                 if( FFABS( p2 - p0 ) < beta ) {
2815                     pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2816                     tc++;
2817                 }
2818                 if( FFABS( q2 - q0 ) < beta ) {
2819                     pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2820                     tc++;
2821                 }
2822
2823                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2824                 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
2825                 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
2826             }
2827             pix += ystride;
2828         }
2829     }
2830 }
2831 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2832 {
2833     h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2834 }
2835 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2836 {
2837     h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2838 }
2839
2840 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2841 {
2842     int i, d;
2843     for( i = 0; i < 4; i++ ) {
2844         const int tc = tc0[i];
2845         if( tc <= 0 ) {
2846             pix += 2*ystride;
2847             continue;
2848         }
2849         for( d = 0; d < 2; d++ ) {
2850             const int p0 = pix[-1*xstride];
2851             const int p1 = pix[-2*xstride];
2852             const int q0 = pix[0];
2853             const int q1 = pix[1*xstride];
2854
2855             if( FFABS( p0 - q0 ) < alpha &&
2856                 FFABS( p1 - p0 ) < beta &&
2857                 FFABS( q1 - q0 ) < beta ) {
2858
2859                 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2860
2861                 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
2862                 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
2863             }
2864             pix += ystride;
2865         }
2866     }
2867 }
2868 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2869 {
2870     h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2871 }
2872 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2873 {
2874     h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
2875 }
2876
2877 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2878 {
2879     int d;
2880     for( d = 0; d < 8; d++ ) {
2881         const int p0 = pix[-1*xstride];
2882         const int p1 = pix[-2*xstride];
2883         const int q0 = pix[0];
2884         const int q1 = pix[1*xstride];
2885
2886         if( FFABS( p0 - q0 ) < alpha &&
2887             FFABS( p1 - p0 ) < beta &&
2888             FFABS( q1 - q0 ) < beta ) {
2889
2890             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
2891             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
2892         }
2893         pix += ystride;
2894     }
2895 }
2896 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2897 {
2898     h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2899 }
2900 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2901 {
2902     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
2903 }
2904
/**
 * Sum of absolute differences between two blocks 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, the same for both blocks
 * @param h         number of rows to compare
 * @return sum over all rows and 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++) {
            sum += abs(pix1[col] - pix2[col]);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2932
/**
 * Sum of absolute differences between pix1 and pix2 at a horizontal
 * half-sample offset, for blocks 16 samples wide.
 *
 * Each reference sample is avg2() of two horizontally adjacent pix2
 * samples, so pix2[0] .. pix2[16] are read on every row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, the same for both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++) {
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2960
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference pixel with the pixel one row below it.
 * Reads h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2990
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (pixel, right, below, below-right).
 * Reads 17 columns and h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3020
/**
 * Sum of absolute differences between two blocks that are 8 pixels wide.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return sum of |pix1 - pix2| over 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3040
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of
 * each reference pixel with its right-hand neighbour.
 * Reads 9 columns of pix2 per row.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3060
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of
 * each reference pixel with the pixel one row below it.
 * Reads h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3082
/**
 * Sum of absolute differences between an 8-wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (pixel, right, below, below-right).
 * Reads 9 columns and h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3104
3105 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3106     MpegEncContext *c = v;
3107     int score1=0;
3108     int score2=0;
3109     int x,y;
3110
3111     for(y=0; y<h; y++){
3112         for(x=0; x<16; x++){
3113             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3114         }
3115         if(y+1<h){
3116             for(x=0; x<15; x++){
3117                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3118                              - s1[x+1] + s1[x+1+stride])
3119                         -FFABS(  s2[x  ] - s2[x  +stride]
3120                              - s2[x+1] + s2[x+1+stride]);
3121             }
3122         }
3123         s1+= stride;
3124         s2+= stride;
3125     }
3126
3127     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3128     else  return score1 + FFABS(score2)*8;
3129 }
3130
3131 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3132     MpegEncContext *c = v;
3133     int score1=0;
3134     int score2=0;
3135     int x,y;
3136
3137     for(y=0; y<h; y++){
3138         for(x=0; x<8; x++){
3139             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3140         }
3141         if(y+1<h){
3142             for(x=0; x<7; x++){
3143                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3144                              - s1[x+1] + s1[x+1+stride])
3145                         -FFABS(  s2[x  ] - s2[x  +stride]
3146                              - s2[x+1] + s2[x+1+stride]);
3147             }
3148         }
3149         s1+= stride;
3150         s2+= stride;
3151     }
3152
3153     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3154     else  return score1 + FFABS(score2)*8;
3155 }
3156
3157 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3158     int i;
3159     unsigned int sum=0;
3160
3161     for(i=0; i<8*8; i++){
3162         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3163         int w= weight[i];
3164         b>>= RECON_SHIFT;
3165         assert(-512<b && b<512);
3166
3167         sum += (w*b)*(w*b)>>4;
3168     }
3169     return sum>>2;
3170 }
3171
3172 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3173     int i;
3174
3175     for(i=0; i<8*8; i++){
3176         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3177     }
3178 }
3179
3180 /**
3181  * permutes an 8x8 block.
3182  * @param block the block which will be permuted according to the given permutation vector
3183  * @param permutation the permutation vector
3184  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3185  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3186  *                  (inverse) permutated to scantable order!
3187  */
3188 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3189 {
3190     int i;
3191     DCTELEM temp[64];
3192
3193     if(last<=0) return;
3194     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3195
3196     for(i=0; i<=last; i++){
3197         const int j= scantable[i];
3198         temp[j]= block[j];
3199         block[j]=0;
3200     }
3201
3202     for(i=0; i<=last; i++){
3203         const int j= scantable[i];
3204         const int perm_j= permutation[j];
3205         block[perm_j]= temp[j];
3206     }
3207 }
3208
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3212
3213 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3214     int i;
3215
3216     memset(cmp, 0, sizeof(void*)*5);
3217
3218     for(i=0; i<5; i++){
3219         switch(type&0xFF){
3220         case FF_CMP_SAD:
3221             cmp[i]= c->sad[i];
3222             break;
3223         case FF_CMP_SATD:
3224             cmp[i]= c->hadamard8_diff[i];
3225             break;
3226         case FF_CMP_SSE:
3227             cmp[i]= c->sse[i];
3228             break;
3229         case FF_CMP_DCT:
3230             cmp[i]= c->dct_sad[i];
3231             break;
3232         case FF_CMP_DCT264:
3233             cmp[i]= c->dct264_sad[i];
3234             break;
3235         case FF_CMP_DCTMAX:
3236             cmp[i]= c->dct_max[i];
3237             break;
3238         case FF_CMP_PSNR:
3239             cmp[i]= c->quant_psnr[i];
3240             break;
3241         case FF_CMP_BIT:
3242             cmp[i]= c->bit[i];
3243             break;
3244         case FF_CMP_RD:
3245             cmp[i]= c->rd[i];
3246             break;
3247         case FF_CMP_VSAD:
3248             cmp[i]= c->vsad[i];
3249             break;
3250         case FF_CMP_VSSE:
3251             cmp[i]= c->vsse[i];
3252             break;
3253         case FF_CMP_ZERO:
3254             cmp[i]= zero_cmp;
3255             break;
3256         case FF_CMP_NSSE:
3257             cmp[i]= c->nsse[i];
3258             break;
3259 #ifdef CONFIG_SNOW_ENCODER
3260         case FF_CMP_W53:
3261             cmp[i]= c->w53[i];
3262             break;
3263         case FF_CMP_W97:
3264             cmp[i]= c->w97[i];
3265             break;
3266 #endif
3267         default:
3268             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3269         }
3270     }
3271 }
3272
3273 /**
3274  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3275  */
3276 static void clear_blocks_c(DCTELEM *blocks)
3277 {
3278     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3279 }
3280
/**
 * Adds src to dst byte by byte, wrapping modulo 256.
 * @param dst bytes updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    /* bytes are handled in ascending order */
    for(n=0; n<w; n++)
        dst[n] += src[n];
}
3296
/**
 * Stores the byte-wise difference src1 - src2 into dst, wrapping modulo 256.
 * @param dst  output bytes
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    /* bytes are handled in ascending order */
    for(n=0; n<w; n++)
        dst[n] = src1[n]-src2[n];
}
3312
/**
 * Writes to dst the difference between src2 and a mid_pred() prediction
 * built from the left value, the value from src1 at the same position and
 * their gradient (left + top - top-left, masked to 8 bits).
 * @param dst      output residuals, w bytes
 * @param src1     row supplying the "top" values
 * @param src2     row being predicted; also supplies the running "left" value
 * @param w        number of bytes to process
 * @param left     in: initial left value; out: last byte of src2 processed
 * @param left_top in: initial top-left value; out: last byte of src1 processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t l  = *left;
    uint8_t lt = *left_top;
    int n;

    for(n=0; n<w; n++){
        const int top  = src1[n];
        const int grad = (l + top - lt)&0xFF;
        const int pred = mid_pred(l, top, grad);

        lt = top;
        l  = src2[n];
        dst[n] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
3330
/* o1 = i1 + i2 and o2 = i1 - i2; expands to two statements and evaluates
   each input twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly: (x, y) becomes (x + y, x - y) */
#define BUTTERFLY1(x,y) \
{\
    const int bfly_in0= (x);\
    const int bfly_in1= (y);\
    x= bfly_in0+bfly_in1;\
    y= bfly_in0-bfly_in1;\
}

/* |x + y| + |x - y|, for a final butterfly stage whose outputs are only summed */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3345
/**
 * Applies horizontal then vertical butterfly passes to the 8x8 difference
 * src - dst and returns the sum of the absolute values of the result.
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      must be 8
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int k;

    assert(h==8);

    /* horizontal pass: one row of differences per iteration */
    for(k=0; k<8; k++){
        int * const row= temp + 8*k;
        const uint8_t * const sp= src + stride*k;
        const uint8_t * const dp= dst + stride*k;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: the last stage is folded into the absolute sum */
    for(k=0; k<8; k++){
        int * const col= temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3397
/**
 * Applies horizontal then vertical butterfly passes to an 8x8 block of
 * pixels and returns the sum of the absolute values of the result, with the
 * |temp[0] + temp[32]| term (marked "-mean" in the code) subtracted.
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride offset between consecutive rows of src
 * @param h      must be 8
 * @return sum of absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int k;

    assert(h==8);

    /* horizontal pass: one row of pixels per iteration */
    for(k=0; k<8; k++){
        int * const row= temp + 8*k;
        const uint8_t * const sp= src + stride*k;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: the last stage is folded into the absolute sum */
    for(k=0; k<8; k++){
        int * const col= temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* -mean */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
3445
3446 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3447     MpegEncContext * const s= (MpegEncContext *)c;
3448     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3449     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3450
3451     assert(h==8);
3452
3453     s->dsp.diff_pixels(temp, src1, src2, stride);
3454     s->dsp.fdct(temp);
3455     return s->dsp.sum_abs_dctelem(temp);
3456 }
3457
#ifdef CONFIG_GPL
/* One 8-point 1-D transform pass. The user defines SRC(n) to read input n
   and DST(n,v) to consume output n before expanding this macro. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Takes the 8x8 pixel difference of the two blocks, applies DCT8_1D along
 * one dimension in place and then along the other, summing the absolute
 * values of the second pass outputs.
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      not used by this function
 * @return sum of absolute transformed differences
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= c;
    DCTELEM dct[8][8];
    int sad=0;
    int n;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform dct[n][0..7] in place */
#define SRC(x) dct[n][x]
#define DST(x,v) dct[n][x]= v
    for( n = 0; n < 8; n++ ){
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform dct[0..7][n], only the absolute sum is kept */
#define SRC(x) dct[x][n]
#define DST(x,v) sad += FFABS(v)
    for( n = 0; n < 8; n++ ){
        DCT8_1D
    }
#undef SRC
#undef DST
    return sad;
}
#endif
3510
3511 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3512     MpegEncContext * const s= (MpegEncContext *)c;
3513     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3514     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3515     int sum=0, i;
3516
3517     assert(h==8);
3518
3519     s->dsp.diff_pixels(temp, src1, src2, stride);
3520     s->dsp.fdct(temp);
3521
3522     for(i=0; i<64; i++)
3523         sum= FFMAX(sum, FFABS(temp[i]));
3524
3525     return sum;
3526 }
3527
3528 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3529     MpegEncContext * const s= (MpegEncContext *)c;
3530     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3531     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3532     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3533     int sum=0, i;
3534
3535     assert(h==8);
3536     s->mb_intra=0;
3537
3538     s->dsp.diff_pixels(temp, src1, src2, stride);
3539
3540     memcpy(bak, temp, 64*sizeof(DCTELEM));
3541
3542     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3543     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3544     ff_simple_idct(temp); //FIXME
3545
3546     for(i=0; i<64; i++)
3547         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3548
3549     return sum;
3550 }
3551
3552 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3553     MpegEncContext * const s= (MpegEncContext *)c;
3554     const uint8_t *scantable= s->intra_scantable.permutated;
3555     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3556     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3557     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3558     uint8_t * const bak= (uint8_t*)aligned_bak;
3559     int i, last, run, bits, level, distoration, start_i;
3560     const int esc_length= s->ac_esc_length;
3561     uint8_t * length;
3562     uint8_t * last_length;
3563
3564     assert(h==8);
3565
3566     for(i=0; i<8; i++){
3567         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3568         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3569     }
3570
3571     s->dsp.diff_pixels(temp, src1, src2, stride);
3572
3573     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3574
3575     bits=0;
3576
3577     if (s->mb_intra) {
3578         start_i = 1;
3579         length     = s->intra_ac_vlc_length;
3580         last_length= s->intra_ac_vlc_last_length;
3581         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3582     } else {
3583         start_i = 0;
3584         length     = s->inter_ac_vlc_length;
3585         last_length= s->inter_ac_vlc_last_length;
3586     }
3587
3588     if(last>=start_i){
3589         run=0;
3590         for(i=start_i; i<last; i++){
3591             int j= scantable[i];
3592             level= temp[j];
3593
3594             if(level){
3595                 level+=64;
3596                 if((level&(~127)) == 0){
3597                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3598                 }else
3599                     bits+= esc_length;
3600                 run=0;
3601             }else
3602                 run++;
3603         }
3604         i= scantable[last];
3605
3606         level= temp[i] + 64;
3607
3608         assert(level - 64);
3609
3610         if((level&(~127)) == 0){
3611             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3612         }else
3613             bits+= esc_length;
3614
3615     }
3616
3617     if(last>=0){
3618         if(s->mb_intra)
3619             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3620         else
3621             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3622     }
3623
3624     s->dsp.idct_add(bak, stride, temp);
3625
3626     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3627
3628     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3629 }
3630
3631 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3632     MpegEncContext * const s= (MpegEncContext *)c;
3633     const uint8_t *scantable= s->intra_scantable.permutated;
3634     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3635     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3636     int i, last, run, bits, level, start_i;
3637     const int esc_length= s->ac_esc_length;
3638     uint8_t * length;
3639     uint8_t * last_length;
3640
3641     assert(h==8);
3642
3643     s->dsp.diff_pixels(temp, src1, src2, stride);
3644
3645     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3646
3647     bits=0;
3648
3649     if (s->mb_intra) {
3650         start_i = 1;
3651         length     = s->intra_ac_vlc_length;
3652         last_length= s->intra_ac_vlc_last_length;
3653         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3654     } else {
3655         start_i = 0;
3656         length     = s->inter_ac_vlc_length;
3657         last_length= s->inter_ac_vlc_last_length;
3658     }
3659
3660     if(last>=start_i){
3661         run=0;
3662         for(i=start_i; i<last; i++){
3663             int j= scantable[i];
3664             level= temp[j];
3665
3666             if(level){
3667                 level+=64;
3668                 if((level&(~127)) == 0){
3669                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3670                 }else
3671                     bits+= esc_length;
3672                 run=0;
3673             }else
3674                 run++;
3675         }
3676         i= scantable[last];
3677
3678         level= temp[i] + 64;
3679
3680         assert(level - 64);
3681
3682         if((level&(~127)) == 0){
3683             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3684         }else
3685             bits+= esc_length;
3686     }
3687
3688     return bits;
3689 }
3690
/**
 * Sum of absolute differences between each row of a 16-wide block and the
 * row below it.
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const below= s + stride;

        for(col=0; col<16; col++)
            score+= FFABS(s[col] - below[col]);
        s= s + stride;
    }

    return score;
}
3705
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-wide block: each residual row is compared with the one below it.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int delta= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            score+= FFABS(delta);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3720
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between each row of a 16-wide block and the
 * row below it.
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t * const below= s + stride;

        for(col=0; col<16; col++)
            score+= SQ(s[col] - below[col]);
        s= s + stride;
    }

    return score;
}
3736
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-wide block: each residual row is compared with the one below it.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int delta= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            score+= SQ(delta);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3751
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements to compare
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total=0;
    int k;

    for(k=0; k<size; k++){
        const int d= pix1[k]-pix2[k];
        total+= d*d;
    }
    return total;
}
3760
/*
 * Each line hands an 8x8 comparison function and a 16-suffixed name to
 * WARPER8_16_SQ.
 * NOTE(review): the macro is defined outside this section; presumably it
 * defines the second name as a wrapper that applies the first function to
 * the 8x8 parts of a 16-wide block and adds the results -- confirm against
 * the macro definition.
 */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3771
/**
 * Multiplies dst element-wise by src, in place.
 * @param dst vector updated in place
 * @param src multiplier vector
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int n;

    for(n=0; n<len; n++){
        dst[n] *= src[n];
    }
}
3777
/**
 * Multiplies src0 element-wise by src1 read back to front:
 * dst[i] = src0[i] * src1[len-1-i].
 * @param dst  output vector
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const int tail = len-1;
    int n;

    for(n=0; n<len; n++){
        dst[n] = src0[n] * src1[tail-n];
    }
}
3784
/**
 * Computes src0[i]*src1[i] + src2[i] + src3 and stores it at dst[i*step].
 * @param dst  output, written with a stride of step elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 constant addend
 * @param len  number of elements to process
 * @param step distance between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int n;

    for(n=0; n<len; n++){
        dst[n*step] = src0[n] * src1[n] + src2[n] + src3;
    }
}
3790
/**
 * Converts floats to int16_t by working on their 32-bit representation.
 * The low 16 bits of each float's bit pattern, offset by -0x8000, become
 * the sample (385.0f yields 0). If any of bits 16..19 is set, the sample
 * saturates instead: bit patterns above 0x43c0ffff give 32767, lower ones
 * give -32768.
 * @param dst output samples
 * @param src input floats
 * @param len number of samples to convert
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t bits;
        int_fast32_t tmp;

        /* copy the representation out; reading the float through an
           int32_t pointer would break the aliasing rules */
        memcpy(&bits, &src[i], sizeof(bits));
        tmp = bits;

        if(tmp & 0xf0000){
            /* the shift turns the comparison result into -1 or 0 */
            tmp = (0x43c0ffff - tmp)>>31;
            // is an explicit compare (tmp > 0x43c0ffff ? 0xFFFF : 0) faster on some gcc/cpu combinations?
        }
        dst[i] = tmp - 0x8000;
    }
}
3804
/* fixed-point IDCT weights: Wn = 2048*sqrt(2)*cos(n*pi/16), W0 = 2048 */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 IDCT: transforms 8 consecutive coefficients in place.
 * @param b the 8 coefficients of one row
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: weighted pairs; all inputs are read before b is written */
    const int a1 = W1*b[1]+W7*b[7];
    const int a7 = W7*b[1]-W1*b[7];
    const int a5 = W5*b[5]+W3*b[3];
    const int a3 = W3*b[5]-W5*b[3];
    const int a2 = W2*b[2]+W6*b[6];
    const int a6 = W6*b[2]-W2*b[6];
    const int a0 = W0*b[0]+W0*b[4];
    const int a4 = W0*b[0]-W0*b[4];
    /* step 2: odd-part cross terms (inputs 1,3,5,7) */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: combine, round and scale down by 8 bits */
    const int rnd = 1<<7;

    b[0] = (a0+a2+a1+a5 + rnd)>>8;
    b[1] = (a4+a6 +s1   + rnd)>>8;
    b[2] = (a4-a6 +s2   + rnd)>>8;
    b[3] = (a0-a2+a7+a3 + rnd)>>8;
    b[4] = (a0-a2-a7-a3 + rnd)>>8;
    b[5] = (a4-a6 -s2   + rnd)>>8;
    b[6] = (a4+a6 -s1   + rnd)>>8;
    b[7] = (a0+a2-a1-a5 + rnd)>>8;
}
3840 static void wmv2_idct_col(short * b)
3841 {
3842     int s1,s2;
3843     int a0,a1,a2,a3,a4,a5,a6,a7;
3844     /*step 1, with extended precision*/
3845     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3846     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3847     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3848     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3849     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3850     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3851     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3852     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3853     /*step 2*/
3854     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3855     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3856     /*step 3*/
3857     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3858     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3859     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3860     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3861
3862     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3863     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3864     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3865     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3866 }
/**
 * In-place WMV2 IDCT of an 8x8 block: all 8 rows first, then all 8 columns.
 * @param block 64 coefficients, 8 per row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for(row=0; row<8; row++)
        wmv2_idct_row(block + 8*row);

    for(col=0; col<8; col++)
        wmv2_idct_col(block + col);
}
3877 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3878  converted */
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the transformed block
 * to put_pixels_clamped_c() together with dest and line_size.
 * NOTE(review): presumably this stores the clamped samples to dest with a
 * row stride of line_size -- confirm against put_pixels_clamped_c().
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the transformed block
 * to add_pixels_clamped_c() together with dest and line_size.
 * NOTE(review): presumably this adds the samples to dest with clamping --
 * confirm against add_pixels_clamped_c().
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block in place, then pass the result to
 * put_pixels_clamped_c() with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block in place, then pass the result to
 * add_pixels_clamped_c() with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3899
/**
 * Run j_rev_dct4() on block in place, then pass the result to
 * put_pixels_clamped4_c() with dest and line_size.
 * Installed by dsputil_init() as idct_put when avctx->lowres is 1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run j_rev_dct4() on block in place, then pass the result to
 * add_pixels_clamped4_c() with dest and line_size.
 * Installed by dsputil_init() as idct_add when avctx->lowres is 1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3910
/**
 * Run j_rev_dct2() on block in place, then pass the result to
 * put_pixels_clamped2_c() with dest and line_size.
 * Installed by dsputil_init() as idct_put when avctx->lowres is 2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run j_rev_dct2() on block in place, then pass the result to
 * add_pixels_clamped2_c() with dest and line_size.
 * Installed by dsputil_init() as idct_add when avctx->lowres is 2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3921
3922 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3923 {
3924     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3925
3926     dest[0] = cm[(block[0] + 4)>>3];
3927 }
3928 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3929 {
3930     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3931
3932     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3933 }
3934
3935 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3936
3937 /* init static data */
3938 void dsputil_static_init(void)
3939 {
3940     int i;
3941
3942     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3943     for(i=0;i<MAX_NEG_CROP;i++) {
3944         ff_cropTbl[i] = 0;
3945         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3946     }
3947
3948     for(i=0;i<512;i++) {
3949         ff_squareTbl[i] = (i - 256) * (i - 256);
3950     }
3951
3952     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3953 }
3954
3955 int ff_check_alignment(void){
3956     static int did_fail=0;
3957     DECLARE_ALIGNED_16(int, aligned);
3958
3959     if((long)&aligned & 15){
3960         if(!did_fail){
3961 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3962             av_log(NULL, AV_LOG_ERROR,
3963                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3964                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3965                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3966                 "Do not report crashes to FFmpeg developers.\n");
3967 #endif
3968             did_fail=1;
3969         }
3970         return -1;
3971     }
3972     return 0;
3973 }
3974
/**
 * Fill a DSPContext with function pointers and the IDCT permutation table.
 *
 * Order of operations:
 *  1. ff_check_alignment() is called (its return value is ignored);
 *  2. when CONFIG_ENCODERS is defined, the forward DCT is chosen from
 *     avctx->dct_algo;
 *  3. the IDCT functions and idct_permutation_type are chosen from
 *     avctx->lowres and avctx->idct_algo;
 *  4. the remaining entries are set to the C implementations;
 *  5. the architecture-specific init functions run for every ENABLE_* that
 *     is non-zero;
 *  6. 2-tap qpel entries that are still NULL are filled from the h264 qpel
 *     tables;
 *  7. c->idct_permutation is built from c->idct_permutation_type.
 *
 * @param c     context to initialise
 * @param avctx codec context; dct_algo, lowres and idct_algo are read here,
 *              and it is passed on to the codec/arch specific init functions
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

    /* forward DCT selection (encoders only) */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres 1..3 use the reduced-size j_rev_dct
       variants, otherwise idct_algo picks the full-size IDCT */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    /* basic pixel helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* dspfunc(PFX, IDX, NUM) fills the four entries (plain, _x2, _y2, _xy2)
       of c-><PFX>_pixels_tab[IDX] with the <PFX>_pixels<NUM>*_c functions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: only indices 0-2, 4-6 and 8-10 are assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc is redefined for the 16-entry tables: entry x + 4*y of
       c-><PFX>_pixels_tab[IDX] gets <PFX><NUM>_mc<x><y>_c */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    /* note: avg_h264_qpel has no index-3 (size 2) entry set here, unlike
       put_h264_qpel */
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    /* codec-specific DSP init, compiled in only for configured codecs */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) sets c->name[0] to name16_c and c->name[1] to
       name8x8_c; the macro is not #undef'd afterwards */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    /* no C implementation is installed for this entry */
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#ifdef CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* clear the 2-tap qpel tables so that, after the arch-specific init
       below, entries nobody provided can be recognised as NULL */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific init; these run after all the C defaults above
       have been stored (presumably so they can replace them -- confirm in
       the respective dsputil_init_* implementations) */
    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    /* fall back to the h264 qpel functions for every 2-tap entry that is
       still NULL. The tables are walked as 64 consecutive entries through
       row 0 -- NOTE(review): this assumes each table is declared as [4][16];
       confirm against dsputil.h */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* row bits (0x38) kept; column bits rotated: bits 2..1 move down
           to 1..0 and bit 0 moves up to bit 2 */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swap the 3 row bits with the 3 column bits */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        /* bits 5 and 2 kept; the low two column bits are exchanged with
           the low two row bits */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    default:
        /* idct_permutation is left unmodified in this case */
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4332