3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* Snow wavelet transform — defined in snow.c; declared here rather than in a header. */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Vorbis channel decoupling — defined in vorbis.c; declared here rather than in a header. */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* Clamp lookup table with MAX_NEG_CROP headroom on each side; zero here,
 * presumably filled during dsputil initialization (not visible in this chunk).
 * Used as cm = ff_cropTbl + MAX_NEG_CROP so cm[x] clamps x into 0..255. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares lookup; used below as sq = ff_squareTbl + 256, so sq[d] is read for
 * d in roughly -256..255. Zero here — presumably filled at init time. */
uint32_t ff_squareTbl[512] = {0, };
/* Standard JPEG/MPEG zigzag scan order: entry i gives the raster index of
 * the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
0, 8, 1, 9, 16, 24, 2, 10,
17, 25, 32, 40, 48, 56, 33, 41,
18, 26, 3, 11, 4, 12, 19, 27,
34, 42, 49, 57, 50, 58, 35, 43,
20, 28, 5, 13, 6, 14, 21, 29,
36, 44, 51, 59, 52, 60, 37, 45,
22, 30, 7, 15, 23, 31, 38, 46,
53, 61, 54, 62, 39, 47, 55, 63,
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zero-initialized here; presumably filled during dsputil init — not visible
 * in this chunk. 8-byte aligned for MMX access. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate (horizontal-first) coefficient scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
0, 1, 2, 3, 8, 9, 16, 17,
10, 11, 4, 5, 6, 7, 15, 14,
13, 12, 19, 18, 24, 25, 32, 33,
26, 27, 20, 21, 22, 23, 28, 29,
30, 31, 34, 35, 40, 41, 48, 49,
42, 43, 36, 37, 38, 39, 44, 45,
46, 47, 50, 51, 56, 57, 58, 59,
52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-first) coefficient scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
0, 8, 16, 24, 1, 9, 2, 10,
17, 25, 32, 40, 48, 56, 57, 49,
41, 33, 26, 18, 3, 11, 4, 12,
19, 27, 34, 42, 50, 58, 35, 43,
51, 59, 20, 28, 5, 13, 6, 14,
21, 29, 36, 44, 52, 60, 37, 45,
53, 61, 22, 30, 7, 15, 23, 31,
38, 46, 54, 62, 39, 47, 55, 63,
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table for division-by-multiplication: ff_inverse[b] is
 * approximately 2^32/b (rounded so the identity above holds). */
const uint32_t ff_inverse[256]={
0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* Input permutation for the simple_idct_mmx */
/* Each entry encodes the destination index for the corresponding input
 * coefficient (values are raster indices written in hex). */
static const uint8_t simple_mmx_permutation[64]={
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/**
 * Sums the pixels of a 16x16 block (inner loop steps 8 pixels at a time).
 * NOTE(review): most of the body is elided in this extract.
 * @param pix       top-left of the block
 * @param line_size stride in bytes between rows
 */
static int pix_sum_c(uint8_t * pix, int line_size)
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
pix += line_size - 16;
/**
 * Sum of squared pixel values over a 16x16 block, via the ff_squareTbl
 * lookup (biased by +256 so signed indices are valid).
 * Reads 8 pixels per inner iteration: one 64-bit load where long is 64 bits,
 * otherwise two 32-bit loads. NOTE(review): parts of the body are elided.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
register uint64_t x=*(uint64_t*)pix;
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
s += sq[(x>>32)&0xff];
s += sq[(x>>40)&0xff];
s += sq[(x>>48)&0xff];
s += sq[(x>>56)&0xff];
register uint32_t x=*(uint32_t*)pix;
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
x=*(uint32_t*)(pix+4);
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
pix += line_size - 16;
/**
 * Byte-swaps w 32-bit words from src into dst.
 * Main loop is unrolled 8x; the trailing line handles the remainder
 * (its loop header is elided in this extract).
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
for(i=0; i+8<=w; i+=8){
dst[i+0]= bswap_32(src[i+0]);
dst[i+1]= bswap_32(src[i+1]);
dst[i+2]= bswap_32(src[i+2]);
dst[i+3]= bswap_32(src[i+3]);
dst[i+4]= bswap_32(src[i+4]);
dst[i+5]= bswap_32(src[i+5]);
dst[i+6]= bswap_32(src[i+6]);
dst[i+7]= bswap_32(src[i+7]);
dst[i+0]= bswap_32(src[i+0]);
/**
 * Sum of squared differences over a 4-pixel-wide block, h rows.
 * The signed difference indexes ff_squareTbl biased by +256.
 * The unused void* matches the common comparison-function signature.
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
/** Sum of squared differences over an 8-pixel-wide block, h rows (see sse4_c). */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
s += sq[pix1[4] - pix2[4]];
s += sq[pix1[5] - pix2[5]];
s += sq[pix1[6] - pix2[6]];
s += sq[pix1[7] - pix2[7]];
/** Sum of squared differences over a 16-pixel-wide block, h rows (see sse4_c). */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[ 0] - pix2[ 0]];
s += sq[pix1[ 1] - pix2[ 1]];
s += sq[pix1[ 2] - pix2[ 2]];
s += sq[pix1[ 3] - pix2[ 3]];
s += sq[pix1[ 4] - pix2[ 4]];
s += sq[pix1[ 5] - pix2[ 5]];
s += sq[pix1[ 6] - pix2[ 6]];
s += sq[pix1[ 7] - pix2[ 7]];
s += sq[pix1[ 8] - pix2[ 8]];
s += sq[pix1[ 9] - pix2[ 9]];
s += sq[pix1[10] - pix2[10]];
s += sq[pix1[11] - pix2[11]];
s += sq[pix1[12] - pix2[12]];
s += sq[pix1[13] - pix2[13]];
s += sq[pix1[14] - pix2[14]];
s += sq[pix1[15] - pix2[15]];
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain distortion metric for the snow encoder: computes the
 * pixel difference (scaled <<4) into tmp, runs ff_spatial_dwt on it, then
 * walks each decomposition level/orientation weighting coefficients with
 * the scale[] tables (indexed [type][dec_count-3][level][ori]).
 * @param w    block width: 8 (3 decompositions) or 16/32 (4 decompositions)
 * @param type wavelet: 0 = 9/7, 1 = 5/3 (per the table comments)
 * NOTE(review): several body lines are elided in this extract.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
const int dec_count= w==8 ? 3 : 4;
static const int scale[2][2][4][4]={
{268, 239, 239, 213},
// 9/7 16x16 or 32x32 dec=4
{344, 310, 310, 280},
{275, 245, 245, 218},
// 5/3 16x16 or 32x32 dec=4
{352, 317, 317, 286},
for (i = 0; i < h; i++) {
for (j = 0; j < w; j+=4) {
tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
for(level=0; level<dec_count; level++){
for(ori= level ? 1 : 0; ori<4; ori++){
int size= w>>(dec_count-level);
int sx= (ori&1) ? size : 0;
int stride= 32<<(dec_count-level);
int sy= (ori&2) ? stride>>1 : 0;
for(i=0; i<size; i++){
for(j=0; j<size; j++){
int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers binding w_c to fixed widths; type 1 = 5/3 wavelet,
 * type 0 = 9/7 (see the scale[] comments in w_c). The 32-wide variants are
 * non-static — presumably referenced from another file; verify at link time. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 8, h, 1);
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 8, h, 0);
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 16, h, 1);
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 16, h, 0);
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 32, h, 1);
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 32, h, 0);
/**
 * Copies an 8-pixel-wide block of unsigned pixels into a DCTELEM block
 * (widening each byte). Loop header/advance lines are elided in this extract.
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
/* read the pixels */
block[0] = pixels[0];
block[1] = pixels[1];
block[2] = pixels[2];
block[3] = pixels[3];
block[4] = pixels[4];
block[5] = pixels[5];
block[6] = pixels[6];
block[7] = pixels[7];
/**
 * Writes the per-pixel difference s1 - s2 of an 8-wide block into the
 * DCTELEM block. Loop header/advance lines are elided in this extract.
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride){
/* read the pixels */
block[0] = s1[0] - s2[0];
block[1] = s1[1] - s2[1];
block[2] = s1[2] - s2[2];
block[3] = s1[3] - s2[3];
block[4] = s1[4] - s2[4];
block[5] = s1[5] - s2[5];
block[6] = s1[6] - s2[6];
block[7] = s1[7] - s2[7];
/**
 * Stores an 8-wide block of DCT coefficients as pixels, clamping each value
 * to 0..255 through the ff_cropTbl lookup (cm).
 */
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
pixels[4] = cm[block[4]];
pixels[5] = cm[block[5]];
pixels[6] = cm[block[6]];
pixels[7] = cm[block[7]];
/** 4-wide variant of put_pixels_clamped_c. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
/** 2-wide variant of put_pixels_clamped_c. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
/**
 * Stores an 8x8 block of signed DCT values as pixels, adding the +128 bias
 * and saturating (values above 127 are clamped; the low-side clamp line is
 * elided in this extract).
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
uint8_t *restrict pixels,
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
else if (*block > 127)
*pixels = (uint8_t)(*block + 128);
pixels += (line_size - 8);
/**
 * Adds an 8-wide block of DCT coefficients onto existing pixels, clamping
 * each result to 0..255 through ff_cropTbl.
 */
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
pixels[4] = cm[pixels[4] + block[4]];
pixels[5] = cm[pixels[5] + block[5]];
pixels[6] = cm[pixels[6] + block[6]];
pixels[7] = cm[pixels[7] + block[7]];
/** 4-wide variant of add_pixels_clamped_c. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
/** 2-wide variant of add_pixels_clamped_c. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
/**
 * Adds DCT coefficients onto pixels, 8 wide, WITHOUT clamping — callers must
 * guarantee the results stay in range.
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
pixels[0] += block[0];
pixels[1] += block[1];
pixels[2] += block[2];
pixels[3] += block[3];
pixels[4] += block[4];
pixels[5] += block[5];
pixels[6] += block[6];
pixels[7] += block[7];
/** 4-wide variant of add_pixels8_c (no clamping). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
pixels[0] += block[0];
pixels[1] += block[1];
pixels[2] += block[2];
pixels[3] += block[3];
/** Sum of absolute DCT coefficient values (loop header elided in this extract). */
static int sum_abs_dctelem_c(DCTELEM *block)
sum+= FFABS(block[i]);
/*
 * PIXOP2(OPNAME, OP) — 64-bit variant: generates copy and halfpel
 * (x2 / y2 / xy2, rounding and no-rounding) pixel operations working on
 * 8 bytes at once via 64-bit loads (LD64) and SWAR byte arithmetic with
 * 0xFEFE… / 0x0303… / 0xFCFC… masks. 16-wide versions are built from the
 * 8-wide ones via CALL_2X_PIXELS. OP is op_put or op_avg.
 * NOTE(review): several continuation lines of this macro are missing from
 * this extract — do not edit without the complete definition.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/*
 * PIXOP2(OPNAME, OP) — 32-bit variant: generates the 2/4/8/16-wide copy,
 * two-source average (_l2) and four-source average (_l4) primitives plus the
 * halfpel x2/y2/xy2 helpers, using 16/32-bit loads (LD16/LD32) and SWAR byte
 * arithmetic with 0x0303… / 0xFCFC… / 0x0F0F… masks. The xy2 helpers bias
 * with 0x0202… (rounding) or 0x0101… (no rounding). OP is op_put or op_avg.
 * NOTE(review): several continuation lines of this macro are missing from
 * this extract — do not edit without the complete definition.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint16_t*)(block )), LD16(pixels ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD16(&src1[i*src_stride1 ]);\
b= LD16(&src2[i*src_stride2 ]);\
OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
int i, a0, b0, a1, b1;\
for(i=0; i<h; i+=2){\
block[0]= (a1+a0)>>2; /* FIXME non put */\
block[1]= (b1+b0)>>2;\
block[0]= (a1+a0)>>2;\
block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
for(j=0; j<2; j++){\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
for(j=0; j<2; j++){\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/**
 * Rounded averages used by the interpolation helpers below.
 * avg2:  (a+b+1)/2          avg4: (a+b+c+d+2)/4
 * Arguments are fully parenthesized so that expansions with low-precedence
 * operands (e.g. avg2(x ? p : q, r)) evaluate correctly — the previous
 * unparenthesized form was a precedence hazard.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Equal-stride wrappers around the macro-generated two-source averagers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/**
 * 1/16-pel bilinear motion compensation (single warp point): each output
 * pixel is the 2x2 bilinear blend of src with weights A..D derived from the
 * fractional offsets (x16, y16), plus rounder, scaled down by >>8
 * (A+B+C+D == 256). Row loop/advance lines are elided in this extract.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
const int A=(16-x16)*(16-y16);
const int B=( x16)*(16-y16);
const int C=(16-x16)*( y16);
const int D=( x16)*( y16);
dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/**
 * Global motion compensation: for each destination pixel, the source
 * position is derived from (ox,oy) and the dxx/dxy/dyx/dyy increments at
 * 1<<shift subpel precision, then bilinearly interpolated. Coordinates that
 * fall outside width/height are clamped with av_clip, and the interpolation
 * degenerates along the clamped axis. NOTE(review): the outer y loop and
 * per-pixel coordinate updates are elided in this extract.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
const int s= 1<<shift;
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
frac_x= src_x&(s-1);
frac_y= src_y&(s-1);
if((unsigned)src_x < width){
if((unsigned)src_y < height){
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
index= src_x + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
if((unsigned)src_y < height){
index= av_clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
/** Full-pel thirdpel "copy" case: dispatches on block width to the plain put routines. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: put_pixels2_c (dst, src, stride, height); break;
case 4: put_pixels4_c (dst, src, stride, height); break;
case 8: put_pixels8_c (dst, src, stride, height); break;
case 16:put_pixels16_c(dst, src, stride, height); break;
1224 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1226 for (i=0; i < height; i++) {
1227 for (j=0; j < width; j++) {
1228 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Thirdpel MC, x = 2/3: per pixel round((cur + 2*right)/3); 683 = 2^11/3.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1235 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1237 for (i=0; i < height; i++) {
1238 for (j=0; j < width; j++) {
1239 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirdpel MC, y = 1/3: per pixel round((2*cur + below)/3); 683 = 2^11/3.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1246 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1248 for (i=0; i < height; i++) {
1249 for (j=0; j < width; j++) {
1250 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirdpel MC, (x,y) = (1/3,1/3): 2x2 bilinear blend with weights
 * (4,3,3,2)/12 over {cur, right, below, below-right}; 2731 = round(2^15/12).
 * (Elided: i/j declarations, per-row src/dst advances.) */
1257 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1259 for (i=0; i < height; i++) {
1260 for (j=0; j < width; j++) {
1261 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, (x,y) = (1/3,2/3): weights (3,2,4,3)/12; 2731 = 2^15/12.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1268 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1270 for (i=0; i < height; i++) {
1271 for (j=0; j < width; j++) {
1272 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, y = 2/3: per pixel round((cur + 2*below)/3); 683 = 2^11/3.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1279 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirdpel MC, (x,y) = (2/3,1/3): weights (3,4,2,3)/12; 2731 = 2^15/12.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1290 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, (x,y) = (2/3,2/3): weights (2,3,3,4)/12; 2731 = 2^15/12.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1301 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC (averaging), no sub-pel offset: dispatch on width to the
 * fixed-size avg_pixels helpers, which round-average into dst.
 * (The switch header and closing braces are elided in this excerpt.) */
1312 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314 case 2: avg_pixels2_c (dst, src, stride, height); break;
1315 case 4: avg_pixels4_c (dst, src, stride, height); break;
1316 case 8: avg_pixels8_c (dst, src, stride, height); break;
1317 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging variant of put_tpel_pixels_mc10_c: same x = 1/3 filter, then a
 * round-up average with the existing dst pixel. */
1321 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c (x = 2/3 filter + dst average). */
1332 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c (y = 1/3 filter + dst average). */
1343 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c (weights (4,3,3,2)/12 + dst average). */
1354 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c (weights (3,2,4,3)/12 + dst average). */
1365 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c (y = 2/3 filter + dst average). */
1376 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c (weights (3,4,2,3)/12 + dst average). */
1387 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c (weights (2,3,3,4)/12 + dst average). */
1398 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1400 for (i=0; i < height; i++) {
1401 for (j=0; j < width; j++) {
1402 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width): declares the fixed-width thirdpel "put" wrappers (one
 * per MC sub-position) that forward to the generic width-parameterized
 * put_tpel_pixels_mc*_c implementations above.
 * Fix: each forwarding statement previously began with a stray `void`,
 * which turned the intended call into a malformed local function
 * declaration — the wrappers never invoked the generic functions.  The
 * `void` keyword is removed so each wrapper actually delegates. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): expands to 2-, 4- and 8-wide chroma MC
 * functions using 1/8-pel bilinear weights A..D (A+B+C+D = (8-x+x)*(8-y+y)
 * = 64); OP stores or averages the 64-scaled sum into dst.
 * NOTE(review): loop headers, fast paths and closing braces are elided in
 * this excerpt. */
1430 #define H264_CHROMA_MC(OPNAME, OP)\
1431 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1432 const int A=(8-x)*(8-y);\
1433 const int B=( x)*(8-y);\
1434 const int C=(8-x)*( y);\
1435 const int D=( x)*( y);\
1438 assert(x<8 && y<8 && x>=0 && y>=0);\
1442 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1443 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 4-wide variant: identical weighting, two more output columns */\
1449 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1450 const int A=(8-x)*(8-y);\
1451 const int B=( x)*(8-y);\
1452 const int C=(8-x)*( y);\
1453 const int D=( x)*( y);\
1456 assert(x<8 && y<8 && x>=0 && y>=0);\
1460 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1461 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1462 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1463 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
/* 8-wide variant */\
1469 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1470 const int A=(8-x)*(8-y);\
1471 const int B=( x)*(8-y);\
1472 const int C=(8-x)*( y);\
1473 const int D=( x)*( y);\
1476 assert(x<8 && y<8 && x>=0 && y>=0);\
1480 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1481 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1482 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1483 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1484 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1485 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1486 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1487 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Chroma store ops: the weight sum is 64, hence the +32 >> 6 rounding
 * renormalisation; op_avg additionally round-averages with existing dst. */
1493 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1494 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
1496 H264_CHROMA_MC(put_ , op_put)
1497 H264_CHROMA_MC(avg_ , op_avg)
/* Like put_h264_chroma_mc8_c but "no rounding": bias 32-4 = 28 instead of
 * 32 before the >>6, biasing results downwards (presumably for a
 * no-rounding codec mode — verify against callers).  NOTE(review): the
 * per-row loop and src/dst advances are elided in this excerpt. */
1501 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1502 const int A=(8-x)*(8-y);
1503 const int B=( x)*(8-y);
1504 const int C=(8-x)*( y);
1505 const int D=( x)*( y);
1508 assert(x<8 && y<8 && x>=0 && y>=0);
1512 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1513 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1514 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1515 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1516 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1517 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1518 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1519 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* QPEL_MC(r, OPNAME, RND, OP): expands to the complete family of MPEG-4
 * quarter-pel motion-compensation routines for 8x8 and 16x16 luma blocks:
 * horizontal/vertical lowpass filters (tap set 20,-6,3,-1 per sample pair,
 * weight sum 32, with right/bottom edge taps mirrored back into the block)
 * plus the mcXY sub-position functions built from them (X/Y = horizontal/
 * vertical quarter-pel phase).  OP clips-and-stores or clips-and-averages;
 * RND selects the rounding flavour of the intermediate half-pel planes.
 * The ff_*_old_c variants are the older, more exact 4-plane (_l4)
 * averaging implementations.  NOTE(review): loop headers, several local
 * declarations and closing braces are elided in this excerpt. */
1525 #define QPEL_MC(r, OPNAME, RND, OP) \
1526 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1531 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
/* right-edge taps are mirrored: src[8] is the last available sample */\
1536 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-wide vertical lowpass: the same taps applied down each column */\
1544 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1546 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1550 const int src0= src[0*srcStride];\
1551 const int src1= src[1*srcStride];\
1552 const int src2= src[2*srcStride];\
1553 const int src3= src[3*srcStride];\
1554 const int src4= src[4*srcStride];\
1555 const int src5= src[5*srcStride];\
1556 const int src6= src[6*srcStride];\
1557 const int src7= src[7*srcStride];\
1558 const int src8= src[8*srcStride];\
1559 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass */\
1572 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1578 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical lowpass */\
1599 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1605 const int src0= src[0*srcStride];\
1606 const int src1= src[1*srcStride];\
1607 const int src2= src[2*srcStride];\
1608 const int src3= src[3*srcStride];\
1609 const int src4= src[4*srcStride];\
1610 const int src5= src[5*srcStride];\
1611 const int src6= src[6*srcStride];\
1612 const int src7= src[7*srcStride];\
1613 const int src8= src[8*srcStride];\
1614 const int src9= src[9*srcStride];\
1615 const int src10= src[10*srcStride];\
1616 const int src11= src[11*srcStride];\
1617 const int src12= src[12*srcStride];\
1618 const int src13= src[13*srcStride];\
1619 const int src14= src[14*srcStride];\
1620 const int src15= src[15*srcStride];\
1621 const int src16= src[16*srcStride];\
1622 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 mcXY sub-position functions: half/full-pel planes are combined with */\
/* OPNAME ## pixels8_l2/_l4 averaging helpers */\
1643 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels8_c(dst, src, stride, 8);\
1647 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1653 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1657 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1659 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1663 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664 uint8_t full[16*9];\
1666 copy_block9(full, src, 16, stride, 9);\
1667 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1671 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 copy_block9(full, src, 16, stride, 9);\
1674 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1677 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678 uint8_t full[16*9];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* _old_c: exact 4-plane average of full, halfH, halfV and halfHV */\
1684 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685 uint8_t full[16*9];\
1688 uint8_t halfHV[64];\
1689 copy_block9(full, src, 16, stride, 9);\
1690 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1695 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696 uint8_t full[16*9];\
1698 uint8_t halfHV[64];\
1699 copy_block9(full, src, 16, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1705 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[16*9];\
1709 uint8_t halfHV[64];\
1710 copy_block9(full, src, 16, stride, 9);\
1711 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1716 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1719 uint8_t halfHV[64];\
1720 copy_block9(full, src, 16, stride, 9);\
1721 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1726 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1730 uint8_t halfHV[64];\
1731 copy_block9(full, src, 16, stride, 9);\
1732 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1740 uint8_t halfHV[64];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1747 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1751 uint8_t halfHV[64];\
1752 copy_block9(full, src, 16, stride, 9);\
1753 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1761 uint8_t halfHV[64];\
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1768 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t halfHV[64];\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1775 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t halfHV[64];\
1778 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1782 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1786 uint8_t halfHV[64];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1793 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1801 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1812 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1820 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 mcXY sub-position functions (24-wide scratch, 17 source rows) */\
1825 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826 OPNAME ## pixels16_c(dst, src, stride, 16);\
1829 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1835 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1839 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1841 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1845 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[24*17];\
1848 copy_block17(full, src, 24, stride, 17);\
1849 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1853 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 copy_block17(full, src, 24, stride, 17);\
1856 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1859 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[24*17];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1866 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867 uint8_t full[24*17];\
1868 uint8_t halfH[272];\
1869 uint8_t halfV[256];\
1870 uint8_t halfHV[256];\
1871 copy_block17(full, src, 24, stride, 17);\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1877 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878 uint8_t full[24*17];\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 copy_block17(full, src, 24, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1887 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888 uint8_t full[24*17];\
1889 uint8_t halfH[272];\
1890 uint8_t halfV[256];\
1891 uint8_t halfHV[256];\
1892 copy_block17(full, src, 24, stride, 17);\
1893 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1898 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t halfH[272];\
1901 uint8_t halfHV[256];\
1902 copy_block17(full, src, 24, stride, 17);\
1903 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1908 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfV[256];\
1912 uint8_t halfHV[256];\
1913 copy_block17(full, src, 24, stride, 17);\
1914 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1929 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfV[256];\
1933 uint8_t halfHV[256];\
1934 copy_block17(full, src, 24, stride, 17);\
1935 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941 uint8_t full[24*17];\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1950 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t halfH[272];\
1952 uint8_t halfHV[256];\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1957 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t halfH[272];\
1959 uint8_t halfHV[256];\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1964 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfV[256];\
1968 uint8_t halfHV[256];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1975 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[24*17];\
1977 uint8_t halfH[272];\
1978 copy_block17(full, src, 24, stride, 17);\
1979 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1983 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1994 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 copy_block17(full, src, 24, stride, 17);\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2002 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003 uint8_t halfH[272];\
2004 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Quarter-pel store ops: the lowpass weight sum is 2*(20-6+3-1) = 32, hence
 * the +16 >> 5 rounding; the *_no_rnd variants use +15 (round down).  cm is
 * the 0..255 clip table set up inside each QPEL_MC-generated function. */
2008 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2009 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2010 #define op_put(a, b) a = cm[((b) + 16)>>5]
2011 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put_, put_no_rnd_ and avg_ quarter-pel families. */
2013 QPEL_MC(0, put_       , _       , op_put)
2014 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2015 QPEL_MC(0, avg_       , _       , op_avg)
2016 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2018 #undef op_avg_no_rnd
2020 #undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2): expands to the H.264 6-tap
 * (1,-5,20,20,-5,1) half-pel lowpass filters at widths 2, 4, 8 and 16.
 * OP stores/averages a 1-D filtered sum; OP2 handles the 2-D (hv) sum
 * which carries an extra factor of 32 (see op2_* below: (x+512)>>10).
 * NOTE(review): loop headers, local declarations and closing braces of
 * these functions appear elided in this dump — confirm against the
 * original file before editing. */
2023 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* width-2 horizontal pass */\
2024 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2026 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2030 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2031 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* width-2 vertical pass */\
2037 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2039 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2043 const int srcB= src[-2*srcStride];\
2044 const int srcA= src[-1*srcStride];\
2045 const int src0= src[0 *srcStride];\
2046 const int src1= src[1 *srcStride];\
2047 const int src2= src[2 *srcStride];\
2048 const int src3= src[3 *srcStride];\
2049 const int src4= src[4 *srcStride];\
2050 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2051 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* width-2 combined pass: horizontal filter into int16 tmp[], then
 * vertical filter of tmp into dst via OP2 */\
2057 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2060 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
/* back up two rows so the 6-tap window has its top context */\
2062 src -= 2*srcStride;\
2063 for(i=0; i<h+5; i++)\
2065 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2066 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
/* rewind tmp to its third row (first output row of the vertical pass) */\
2070 tmp -= tmpStride*(h+5-2);\
2073 const int tmpB= tmp[-2*tmpStride];\
2074 const int tmpA= tmp[-1*tmpStride];\
2075 const int tmp0= tmp[0 *tmpStride];\
2076 const int tmp1= tmp[1 *tmpStride];\
2077 const int tmp2= tmp[2 *tmpStride];\
2078 const int tmp3= tmp[3 *tmpStride];\
2079 const int tmp4= tmp[4 *tmpStride];\
2080 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2081 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* width-4 horizontal 6-tap pass (same (1,-5,20,20,-5,1) kernel) */\
2086 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2088 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2092 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2093 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2094 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2095 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
/* width-4 vertical pass */\
2101 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2103 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2107 const int srcB= src[-2*srcStride];\
2108 const int srcA= src[-1*srcStride];\
2109 const int src0= src[0 *srcStride];\
2110 const int src1= src[1 *srcStride];\
2111 const int src2= src[2 *srcStride];\
2112 const int src3= src[3 *srcStride];\
2113 const int src4= src[4 *srcStride];\
2114 const int src5= src[5 *srcStride];\
2115 const int src6= src[6 *srcStride];\
2116 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2117 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2118 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2119 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
/* width-4 combined hv pass: horizontal into int16 tmp[], vertical via OP2 */\
2125 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2128 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2130 src -= 2*srcStride;\
2131 for(i=0; i<h+5; i++)\
2133 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2135 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2136 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2140 tmp -= tmpStride*(h+5-2);\
2143 const int tmpB= tmp[-2*tmpStride];\
2144 const int tmpA= tmp[-1*tmpStride];\
2145 const int tmp0= tmp[0 *tmpStride];\
2146 const int tmp1= tmp[1 *tmpStride];\
2147 const int tmp2= tmp[2 *tmpStride];\
2148 const int tmp3= tmp[3 *tmpStride];\
2149 const int tmp4= tmp[4 *tmpStride];\
2150 const int tmp5= tmp[5 *tmpStride];\
2151 const int tmp6= tmp[6 *tmpStride];\
2152 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2153 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2154 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2155 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* width-8 horizontal 6-tap pass, fully unrolled across the row */\
2161 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2163 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2167 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2168 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2169 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2170 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2171 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2172 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2173 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2174 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
/* width-8 vertical pass; loads the full 13-row column then filters */\
2180 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2182 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2186 const int srcB= src[-2*srcStride];\
2187 const int srcA= src[-1*srcStride];\
2188 const int src0= src[0 *srcStride];\
2189 const int src1= src[1 *srcStride];\
2190 const int src2= src[2 *srcStride];\
2191 const int src3= src[3 *srcStride];\
2192 const int src4= src[4 *srcStride];\
2193 const int src5= src[5 *srcStride];\
2194 const int src6= src[6 *srcStride];\
2195 const int src7= src[7 *srcStride];\
2196 const int src8= src[8 *srcStride];\
2197 const int src9= src[9 *srcStride];\
2198 const int src10=src[10*srcStride];\
2199 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2200 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2201 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2202 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2203 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2204 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2205 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2206 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* width-8 combined hv pass: horizontal into int16 tmp[], vertical via OP2 */\
2212 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2215 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2217 src -= 2*srcStride;\
2218 for(i=0; i<h+5; i++)\
2220 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2221 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2222 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2223 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2224 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2225 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2226 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2227 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2231 tmp -= tmpStride*(h+5-2);\
2234 const int tmpB= tmp[-2*tmpStride];\
2235 const int tmpA= tmp[-1*tmpStride];\
2236 const int tmp0= tmp[0 *tmpStride];\
2237 const int tmp1= tmp[1 *tmpStride];\
2238 const int tmp2= tmp[2 *tmpStride];\
2239 const int tmp3= tmp[3 *tmpStride];\
2240 const int tmp4= tmp[4 *tmpStride];\
2241 const int tmp5= tmp[5 *tmpStride];\
2242 const int tmp6= tmp[6 *tmpStride];\
2243 const int tmp7= tmp[7 *tmpStride];\
2244 const int tmp8= tmp[8 *tmpStride];\
2245 const int tmp9= tmp[9 *tmpStride];\
2246 const int tmp10=tmp[10*tmpStride];\
2247 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2248 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2249 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2250 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2251 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2252 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2253 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2254 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide filters: each is four 8-wide calls in a 2x2 arrangement
 * (left/right halves, then advance 8 rows and repeat). */\
2260 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2262 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2263 src += 8*srcStride;\
2264 dst += 8*dstStride;\
2265 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2266 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2269 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2270 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2271 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2272 src += 8*srcStride;\
2273 dst += 8*dstStride;\
2274 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2275 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2278 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2279 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2280 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2281 src += 8*srcStride;\
2282 dst += 8*dstStride;\
2283 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2284 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points _mcXY_c, where X/Y are the quarter-pel horizontal/vertical
 * phases (0..3).  Full-pel copies use pixelsN_c; quarter-pel positions are
 * built by averaging (pixelsN_l2) the appropriate half-pel intermediates.
 * "full" buffers hold SIZE x (SIZE+5) source copies including the two
 * context rows above and three below that the 6-tap vertical filter needs;
 * full_mid points at the first real row.
 * NOTE(review): closing braces of the generated functions appear elided in
 * this dump. */\
2287 #define H264_MC(OPNAME, SIZE) \
/* (0,0): plain full-pel copy/average */\
2288 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2289 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* (1,0): average of src and horizontal half-pel */\
2292 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2293 uint8_t half[SIZE*SIZE];\
2294 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2295 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
/* (2,0): horizontal half-pel directly */\
2298 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2299 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
/* (3,0): average of src+1 and horizontal half-pel */\
2302 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2303 uint8_t half[SIZE*SIZE];\
2304 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2305 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* (0,1): average of src and vertical half-pel */\
2308 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2309 uint8_t full[SIZE*(SIZE+5)];\
2310 uint8_t * const full_mid= full + SIZE*2;\
2311 uint8_t half[SIZE*SIZE];\
2312 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2313 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2314 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
/* (0,2): vertical half-pel directly */\
2317 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2318 uint8_t full[SIZE*(SIZE+5)];\
2319 uint8_t * const full_mid= full + SIZE*2;\
2320 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2321 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
/* (0,3): average of the row below and the vertical half-pel */\
2324 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2325 uint8_t full[SIZE*(SIZE+5)];\
2326 uint8_t * const full_mid= full + SIZE*2;\
2327 uint8_t half[SIZE*SIZE];\
2328 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2329 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2330 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal quarter-pel positions: average of a horizontal half-pel
 * (halfH, taken at the nearer row) and a vertical half-pel (halfV,
 * taken at the nearer column via the +1 offset in copy_block). */\
/* (1,1) */\
2333 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2334 uint8_t full[SIZE*(SIZE+5)];\
2335 uint8_t * const full_mid= full + SIZE*2;\
2336 uint8_t halfH[SIZE*SIZE];\
2337 uint8_t halfV[SIZE*SIZE];\
2338 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2339 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2340 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2341 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (3,1): vertical half-pel taken one column to the right */\
2344 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2345 uint8_t full[SIZE*(SIZE+5)];\
2346 uint8_t * const full_mid= full + SIZE*2;\
2347 uint8_t halfH[SIZE*SIZE];\
2348 uint8_t halfV[SIZE*SIZE];\
2349 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2350 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2351 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2352 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (1,3): horizontal half-pel taken one row below */\
2355 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2356 uint8_t full[SIZE*(SIZE+5)];\
2357 uint8_t * const full_mid= full + SIZE*2;\
2358 uint8_t halfH[SIZE*SIZE];\
2359 uint8_t halfV[SIZE*SIZE];\
2360 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2361 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2362 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2363 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (3,3): offsets in both directions */\
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2367 uint8_t full[SIZE*(SIZE+5)];\
2368 uint8_t * const full_mid= full + SIZE*2;\
2369 uint8_t halfH[SIZE*SIZE];\
2370 uint8_t halfV[SIZE*SIZE];\
2371 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2372 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2373 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2374 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Center and half-adjacent positions built around the 2-D (hv) filter. */\
/* (2,2): 2-D half-pel directly */\
2377 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2378 int16_t tmp[SIZE*(SIZE+5)];\
2379 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
/* (2,1): average of horizontal half-pel and 2-D half-pel */\
2382 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2383 int16_t tmp[SIZE*(SIZE+5)];\
2384 uint8_t halfH[SIZE*SIZE];\
2385 uint8_t halfHV[SIZE*SIZE];\
2386 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2387 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2388 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* (2,3): same but horizontal half-pel taken one row below */\
2391 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2392 int16_t tmp[SIZE*(SIZE+5)];\
2393 uint8_t halfH[SIZE*SIZE];\
2394 uint8_t halfHV[SIZE*SIZE];\
2395 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2396 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2397 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* (1,2): average of vertical half-pel and 2-D half-pel */\
2400 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2401 uint8_t full[SIZE*(SIZE+5)];\
2402 uint8_t * const full_mid= full + SIZE*2;\
2403 int16_t tmp[SIZE*(SIZE+5)];\
2404 uint8_t halfV[SIZE*SIZE];\
2405 uint8_t halfHV[SIZE*SIZE];\
2406 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2407 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2408 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2409 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* (3,2): same but vertical half-pel taken one column to the right */\
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t full[SIZE*(SIZE+5)];\
2414 uint8_t * const full_mid= full + SIZE*2;\
2415 int16_t tmp[SIZE*(SIZE+5)];\
2416 uint8_t halfV[SIZE*SIZE];\
2417 uint8_t halfHV[SIZE*SIZE];\
2418 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2419 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2420 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2421 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store/average ops for the H.264 filters: op_* normalize the 1-D 6-tap
 * sum ((x+16)>>5), op2_* the 2-D sum ((x+512)>>10); cm[] clips to 0..255. */
2424 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2425 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2426 #define op_put(a, b) a = cm[((b) + 16)>>5]
2427 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2428 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put and avg variants of the H.264 lowpass filters. */
2430 H264_LOWPASS(put_ , op_put, op2_put)
2431 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 weighted prediction.  op_scale1 applies explicit weighting in
 * place ((pix*weight + offset) >> log2_denom); op_scale2 bidirectionally
 * weights src into dst with a (log2_denom+1) shift.
 * H264_WEIGHT(W,H) generates weight/biweight functions for a WxH block;
 * the W==2/4/8 "continue" ladders stop each row after the block width.
 * NOTE(review): the per-pixel op_scale invocations between the continue
 * statements, and the function closers, appear elided in this dump. */
2446 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2447 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2448 #define H264_WEIGHT(W,H) \
/* explicit (unidirectional) weighting, in place on block[] */\
2449 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2451 offset <<= log2_denom; \
/* add rounding term when a shift will be applied */\
2452 if(log2_denom) offset += 1<<(log2_denom-1); \
2453 for(y=0; y<H; y++, block += stride){ \
2456 if(W==2) continue; \
2459 if(W==4) continue; \
2464 if(W==8) continue; \
/* bidirectional weighting of src into dst */\
2475 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
/* force the rounding offset odd before scaling */\
2477 offset = ((offset + 1) | 1) << log2_denom; \
2478 for(y=0; y<H; y++, dst += stride, src += stride){ \
2481 if(W==2) continue; \
2484 if(W==4) continue; \
2489 if(W==8) continue; \
/* WMV2 8-wide horizontal lowpass: 4-tap (-1,9,9,-1)/16 filter with
 * rounding (+8), clipped through cm[].
 * NOTE(review): the per-row loop and advance of dst/src by their strides
 * appear elided in this dump. */
2516 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2517 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2521 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2522 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2523 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2524 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2525 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2526 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2527 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2528 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* Conditionally compiled full-pel copy/average entry points exported to
 * the CAVS and VC-1/WMV3 DSP modules; each just forwards to the generic
 * pixelsN put/avg primitives defined earlier in this file. */
2534 #ifdef CONFIG_CAVS_DECODER
/* defined in cavsdsp.c */
2536 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2538 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2539 put_pixels8_c(dst, src, stride, 8);
2541 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2542 avg_pixels8_c(dst, src, stride, 8);
2544 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2545 put_pixels16_c(dst, src, stride, 16);
2547 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2548 avg_pixels16_c(dst, src, stride, 16);
2550 #endif /* CONFIG_CAVS_DECODER */
2552 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* defined in vc1dsp.c; rnd parameter is unused for the full-pel case */
2554 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2556 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2557 put_pixels8_c(dst, src, stride, 8);
2559 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2561 #if defined(CONFIG_H264_ENCODER)
/* defined in h264dsp.c */
2563 void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2564 #endif /* CONFIG_H264_ENCODER */
/* WMV2 8-tall vertical lowpass: same (-1,9,9,-1)/16 kernel applied down
 * each column; loads the 11-row column then writes 8 filtered rows.
 * NOTE(review): the per-column loop and closing brace appear elided in
 * this dump. */
2566 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2567 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2571 const int src_1= src[ -srcStride];
2572 const int src0 = src[0 ];
2573 const int src1 = src[ srcStride];
2574 const int src2 = src[2*srcStride];
2575 const int src3 = src[3*srcStride];
2576 const int src4 = src[4*srcStride];
2577 const int src5 = src[5*srcStride];
2578 const int src6 = src[6*srcStride];
2579 const int src7 = src[7*srcStride];
2580 const int src8 = src[8*srcStride];
2581 const int src9 = src[9*srcStride];
2582 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2583 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2584 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2585 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2586 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2587 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2588 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2589 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel MC entry points, built from the h/v lowpass filters above
 * and the pixels8 put/average primitives.  NOTE(review): the local
 * half/halfH/halfV/halfHV buffer declarations and closing braces appear
 * elided in this dump. */
/* (0,0): full-pel copy */
2595 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2596 put_pixels8_c(dst, src, stride, 8);
/* (1,0): average of src and horizontal half-pel */
2599 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2601 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2602 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* (2,0): horizontal half-pel directly */
2605 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2606 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3,0): average of src+1 and horizontal half-pel */
2609 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2611 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2612 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* (0,2): vertical half-pel directly */
2615 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2616 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1,2): average of vertical half-pel and 2-D half-pel; halfH is filtered
 * starting one row above so halfH+8 skips that context row */
2619 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2623 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2624 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2625 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2626 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3,2): same with the vertical half-pel one column to the right */
2628 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2632 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2633 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2634 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2635 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (2,2): 2-D half-pel: horizontal pass then vertical pass */
2637 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2639 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2640 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal block edge (filters the
 * column of pixels p0..p3 spanning the edge, for each x).  d1 is the
 * piecewise-linear correction of the gradient d, clamped to 0 outside
 * +/-2*strength; p1/p2 are corrected then clipped to 0..255 by the
 * branchless "if(p&256) p = ~(p>>31)" trick (negative -> 0, >255 -> 255).
 * d2 softens the outer pixels p0/p3, bounded by |d1|.
 * NOTE(review): the x loop header, the d1/ad1 applications to p1/p2 and
 * the function closer appear elided in this dump. */
2643 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2645 const int strength= ff_h263_loop_filter_strength[qscale];
2649 int p0= src[x-2*stride];
2650 int p1= src[x-1*stride];
2651 int p2= src[x+0*stride];
2652 int p3= src[x+1*stride];
2653 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2655 if (d<-2*strength) d1= 0;
2656 else if(d<- strength) d1=-2*strength - d;
2657 else if(d< strength) d1= d;
2658 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of the corrected inner pixels to 0..255 */
2663 if(p1&256) p1= ~(p1>>31);
2664 if(p2&256) p2= ~(p2>>31);
2666 src[x-1*stride] = p1;
2667 src[x+0*stride] = p2;
/* soften the outer pixels, bounded by the inner correction magnitude */
2671 d2= av_clip((p0-p3)/4, -ad1, ad1);
2673 src[x-2*stride] = p0 - d2;
2674 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter across a vertical block edge — same algorithm
 * as h263_v_loop_filter_c but operating on the row of pixels p0..p3 at
 * src[y*stride-2 .. y*stride+1].  NOTE(review): the y loop header and
 * the d1/ad1 applications appear elided in this dump. */
2678 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2680 const int strength= ff_h263_loop_filter_strength[qscale];
2684 int p0= src[y*stride-2];
2685 int p1= src[y*stride-1];
2686 int p2= src[y*stride+0];
2687 int p3= src[y*stride+1];
2688 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2690 if (d<-2*strength) d1= 0;
2691 else if(d<- strength) d1=-2*strength - d;
2692 else if(d< strength) d1= d;
2693 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of the corrected inner pixels to 0..255 */
2698 if(p1&256) p1= ~(p1>>31);
2699 if(p2&256) p2= ~(p2>>31);
2701 src[y*stride-1] = p1;
2702 src[y*stride+0] = p2;
2706 d2= av_clip((p0-p3)/4, -ad1, ad1);
2708 src[y*stride-2] = p0 - d2;
2709 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter over an 8x8 block: separable (1,2,1)/4 smoothing.
 * A vertical (1,2,1) pass fills temp[] (edge rows copied scaled by 4),
 * then a horizontal (1,2,1) pass writes back with >>4 (edge columns with
 * >>2, having only the vertical scaling to undo).  NOTE(review): the
 * x/y loop headers and several temp[] rows appear elided in this dump. */
2713 static void h261_loop_filter_c(uint8_t *src, int stride){
/* top/bottom rows: no vertical neighbors, pre-scale by 4 */
2718 temp[x ] = 4*src[x ];
2719 temp[x + 7*8] = 4*src[x + 7*stride];
2723 xy = y * stride + x;
2725 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* left/right columns: only undo the vertical scaling */
2730 src[ y*stride] = (temp[ y*8] + 2)>>2;
2731 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2733 xy = y * stride + x;
2735 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (bS<4) luma deblocking filter.  xstride steps across the
 * edge, ystride along it; tc0[i] gives the clipping bound per group of 4
 * samples (edge group skipped when tc0[i] < 0 — elided here).  Samples
 * are filtered only when the |p0-q0|/|p1-p0|/|q1-q0| thresholds against
 * alpha/beta pass; p1/q1 get an extra correction when their second
 * neighbor is smooth (|p2-p0| resp. |q2-q0| < beta), which also widens
 * tc (elided in this dump — see the tc0[i]+1 logic in the original).
 * NOTE(review): the tc initialization, tc0[i]<0 skip and pix advance
 * appear elided in this dump. */
2740 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2743 for( i = 0; i < 4; i++ ) {
2748 for( d = 0; d < 4; d++ ) {
2749 const int p0 = pix[-1*xstride];
2750 const int p1 = pix[-2*xstride];
2751 const int p2 = pix[-3*xstride];
2752 const int q0 = pix[0];
2753 const int q1 = pix[1*xstride];
2754 const int q2 = pix[2*xstride];
2756 if( FFABS( p0 - q0 ) < alpha &&
2757 FFABS( p1 - p0 ) < beta &&
2758 FFABS( q1 - q0 ) < beta ) {
/* p1' : only when the p-side is smooth enough */
2763 if( FFABS( p2 - p0 ) < beta ) {
2764 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
/* q1' : only when the q-side is smooth enough */
2767 if( FFABS( q2 - q0 ) < beta ) {
2768 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* central correction, clipped to +/-tc */
2772 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2773 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2774 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* vertical edge: step across rows */
2780 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2782 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/* horizontal edge: step across columns */
2784 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2786 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal chroma deblocking filter: only p0/q0 are modified, with
 * the delta clipped to +/-tc (tc derived from tc0[i]; groups of 2
 * samples per tc0 entry).  NOTE(review): the tc0[i]<0 skip and pix
 * advance appear elided in this dump. */
2789 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2792 for( i = 0; i < 4; i++ ) {
2793 const int tc = tc0[i];
2798 for( d = 0; d < 2; d++ ) {
2799 const int p0 = pix[-1*xstride];
2800 const int p1 = pix[-2*xstride];
2801 const int q0 = pix[0];
2802 const int q1 = pix[1*xstride];
2804 if( FFABS( p0 - q0 ) < alpha &&
2805 FFABS( p1 - p0 ) < beta &&
2806 FFABS( q1 - q0 ) < beta ) {
2808 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2810 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2811 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* vertical / horizontal edge wrappers */
2817 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2819 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2821 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2823 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra, bS==4) chroma deblocking: p0/q0 replaced by a
 * fixed (2,1,1)/4 average of their neighbors when the alpha/beta
 * thresholds pass; no tc clipping in this mode.
 * NOTE(review): the pix advance along the edge appears elided. */
2826 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2829 for( d = 0; d < 8; d++ ) {
2830 const int p0 = pix[-1*xstride];
2831 const int p1 = pix[-2*xstride];
2832 const int q0 = pix[0];
2833 const int q1 = pix[1*xstride];
2835 if( FFABS( p0 - q0 ) < alpha &&
2836 FFABS( p1 - p0 ) < beta &&
2837 FFABS( q1 - q0 ) < beta ) {
2839 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2840 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* vertical / horizontal edge wrappers */
2845 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2847 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2849 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2851 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* 16-wide SAD (sum of absolute differences) between pix1 and pix2 over
 * h rows; first argument is an unused context pointer required by the
 * DSPContext function-pointer signature.  NOTE(review): the row loop,
 * pointer advances and return appear elided in this dump. */
2854 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2860 s += abs(pix1[0] - pix2[0]);
2861 s += abs(pix1[1] - pix2[1]);
2862 s += abs(pix1[2] - pix2[2]);
2863 s += abs(pix1[3] - pix2[3]);
2864 s += abs(pix1[4] - pix2[4]);
2865 s += abs(pix1[5] - pix2[5]);
2866 s += abs(pix1[6] - pix2[6]);
2867 s += abs(pix1[7] - pix2[7]);
2868 s += abs(pix1[8] - pix2[8]);
2869 s += abs(pix1[9] - pix2[9]);
2870 s += abs(pix1[10] - pix2[10]);
2871 s += abs(pix1[11] - pix2[11]);
2872 s += abs(pix1[12] - pix2[12]);
2873 s += abs(pix1[13] - pix2[13]);
2874 s += abs(pix1[14] - pix2[14]);
2875 s += abs(pix1[15] - pix2[15]);
/* 16-wide SAD against pix2 interpolated at half-pel horizontally
 * (avg2 of each pixel and its right neighbor). */
2882 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2888 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2889 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2890 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2891 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2892 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2893 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2894 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2895 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2896 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2897 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2898 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2899 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2900 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2901 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2902 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2903 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* 16-wide SAD against pix2 interpolated at half-pel vertically
 * (avg2 of each pixel and the one directly below, via pix3). */
2910 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2913 uint8_t *pix3 = pix2 + line_size;
2917 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2918 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2919 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2920 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2921 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2922 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2923 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2924 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2925 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2926 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2927 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2928 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2929 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2930 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2931 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2932 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* 16-wide SAD against pix2 interpolated at half-pel both horizontally
 * and vertically (avg4 of the 2x2 neighborhood). */
2940 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2943 uint8_t *pix3 = pix2 + line_size;
2947 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2948 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2949 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2950 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2951 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2952 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2953 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2954 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2955 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2956 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2957 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2958 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2959 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2960 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2961 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2962 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD between pix1 and pix2 over h rows (8-pixel analogue of
 * pix_abs16_c). */
2970 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2976 s += abs(pix1[0] - pix2[0]);
2977 s += abs(pix1[1] - pix2[1]);
2978 s += abs(pix1[2] - pix2[2]);
2979 s += abs(pix1[3] - pix2[3]);
2980 s += abs(pix1[4] - pix2[4]);
2981 s += abs(pix1[5] - pix2[5]);
2982 s += abs(pix1[6] - pix2[6]);
2983 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against pix2 interpolated at half-pel horizontally. */
2990 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2996 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2997 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2998 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2999 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3000 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3001 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3002 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3003 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against pix2 interpolated at half-pel vertically. */
3010 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3013 uint8_t *pix3 = pix2 + line_size;
3017 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3018 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3019 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3020 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3021 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3022 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3023 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3024 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD of an 8-wide block against a diagonally (x+y) half-pel interpolated
 * reference: avg4() averages the 2x2 neighbourhood spanning two rows.
 * NOTE(review): fragment — loop skeleton and return are elided in this view. */
3032 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3035 uint8_t *pix3 = pix2 + line_size;
3039 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3040 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3041 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3042 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3043 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3044 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3045 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3046 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE for a 16-wide block: score1 is the plain squared
 * error, score2 compares the 2x2 gradient (local texture) of the two
 * blocks. The final cost weights |score2| by avctx->nsse_weight (8 when
 * no context is given), so differences in noise/texture are penalized
 * less than differences in structure.
 * NOTE(review): fragment — outer row loop and closers are elided here. */
3054 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3055 MpegEncContext *c = v;
3061 for(x=0; x<16; x++){
3062 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3065 for(x=0; x<15; x++){
3066 score2+= FFABS( s1[x ] - s1[x +stride]
3067 - s1[x+1] + s1[x+1+stride])
3068 -FFABS( s2[x ] - s2[x +stride]
3069 - s2[x+1] + s2[x+1+stride]);
3076 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3077 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; same squared-error + gradient-difference
 * weighting (see nsse16_c above for the rationale).
 * NOTE(review): fragment — the x-loop headers and row loop are elided. */
3080 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3081 MpegEncContext *c = v;
3088 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3092 score2+= FFABS( s1[x ] - s1[x +stride]
3093 - s1[x+1] + s1[x+1+stride])
3094 -FFABS( s2[x ] - s2[x +stride]
3095 - s2[x+1] + s2[x+1+stride]);
3102 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3103 else return score1 + FFABS(score2)*8;
/* Evaluate the weighted squared error that would result from adding
 * scale*basis (rescaled from BASIS_SHIFT to RECON_SHIFT precision, with
 * rounding) to the residual rem. Used by trellis/basis refinement.
 * The assert documents that b must fit the 10-bit signed range. */
3106 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3110 for(i=0; i<8*8; i++){
3111 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3114 assert(-512<b && b<512);
3116 sum += (w*b)*(w*b)>>4;
/* Commit the update tried by try_8x8basis_c: add the rounded, rescaled
 * scale*basis contribution into the residual in place. */
3121 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3124 for(i=0; i<8*8; i++){
3125 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3130 * permutes an 8x8 block.
3131 * @param block the block which will be permuted according to the given permutation vector
3132 * @param permutation the permutation vector
3133 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3134 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3135 * (inverse) permuted to scantable order!
/* Permute the coefficients of an 8x8 block in place according to
 * permutation[]. Only positions reachable through scantable[0..last] are
 * visited, which is safe because later positions are known to be zero.
 * The first loop (copy into temp) is partially elided in this view;
 * the second loop writes each saved coefficient to its permuted slot. */
3137 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3143 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3145 for(i=0; i<=last; i++){
3146 const int j= scantable[i];
3151 for(i=0; i<=last; i++){
3152 const int j= scantable[i];
3153 const int perm_j= permutation[j];
3154 block[perm_j]= temp[j];
3158 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the 5-entry cmp[] function-pointer array with the comparison
 * functions matching the requested metric type (Hadamard SATD, DCT-SAD,
 * H.264 DCT-SAD, DCT-max, quantization PSNR, ...). The switch arms
 * selecting each case are elided in this view; an unknown type falls
 * through to the error log below. */
3162 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3165 memset(cmp, 0, sizeof(void*)*5);
3173 cmp[i]= c->hadamard8_diff[i];
3179 cmp[i]= c->dct_sad[i];
3182 cmp[i]= c->dct264_sad[i];
3185 cmp[i]= c->dct_max[i];
3188 cmp[i]= c->quant_psnr[i];
3208 #ifdef CONFIG_SNOW_ENCODER
3217 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3223 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero six 8x8 DCTELEM blocks (one macroblock's worth) in one memset. */
3225 static void clear_blocks_c(DCTELEM *blocks)
3227 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes; manually unrolled by 8 with a scalar
 * tail loop (tail loop header elided in this view). Byte arithmetic
 * wraps modulo 256. */
3230 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3232 for(i=0; i+7<w; i+=8){
3233 dst[i+0] += src[i+0];
3234 dst[i+1] += src[i+1];
3235 dst[i+2] += src[i+2];
3236 dst[i+3] += src[i+3];
3237 dst[i+4] += src[i+4];
3238 dst[i+5] += src[i+5];
3239 dst[i+6] += src[i+6];
3240 dst[i+7] += src[i+7];
3243 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for w bytes; unrolled by 8 with a scalar
 * tail (tail loop header elided in this view). Used e.g. for HuffYUV
 * prediction residuals. */
3246 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3248 for(i=0; i+7<w; i+=8){
3249 dst[i+0] = src1[i+0]-src2[i+0];
3250 dst[i+1] = src1[i+1]-src2[i+1];
3251 dst[i+2] = src1[i+2]-src2[i+2];
3252 dst[i+3] = src1[i+3]-src2[i+3];
3253 dst[i+4] = src1[i+4]-src2[i+4];
3254 dst[i+5] = src1[i+5]-src2[i+5];
3255 dst[i+6] = src1[i+6]-src2[i+6];
3256 dst[i+7] = src1[i+7]-src2[i+7];
3259 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction subtraction: the predictor is the median of
 * left (l), top (src1[i]) and left+top-topleft (masked to a byte).
 * NOTE(review): fragment — the loop, the dst store and the write-back of
 * *left / *left_top are elided in this view. */
3262 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3270 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transforms below:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place on (x,y), and BUTTERFLYA returns
 * |x+y| + |x-y| for the final accumulation. The macro bodies of the
 * first two are elided in this view. */
3280 #define BUTTERFLY2(o1,o2,i1,i2) \
3284 #define BUTTERFLY1(x,y) \
3293 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of an 8x8 block: 2-D Hadamard transform of the src-dst difference,
 * then sum of absolute transform coefficients. First pass: horizontal
 * butterflies per row on the differences; second pass: vertical
 * butterflies per column, accumulating with BUTTERFLYA.
 * NOTE(review): fragment — loop headers, temp declaration and the final
 * return are elided in this view; the printf("MAX...") line is debug code. */
3295 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3303 //FIXME try pointer walks
3304 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3305 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3306 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3307 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3309 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3310 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3311 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3312 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3314 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3315 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3316 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3317 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3321 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3322 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3323 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3324 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3326 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3327 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3328 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3329 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3332 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3333 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3334 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3335 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3341 printf("MAX:%d\n", maxi);
/* Intra SATD: same 2-D Hadamard as hadamard8_diff8x8_c but applied to
 * the source pixels directly (dummy is unused), and with the DC term
 * (|temp[0]+temp[8*4]| here, i.e. the mean contribution) subtracted at
 * the end so only AC energy is scored.
 * NOTE(review): fragment — loop headers, temp declaration and the final
 * return are elided in this view. */
3347 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3355 //FIXME try pointer walks
3356 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3357 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3358 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3359 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3361 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3362 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3363 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3364 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3366 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3367 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3368 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3369 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3373 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3374 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3375 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3376 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3378 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3379 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3380 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3381 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3384 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3385 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3386 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3387 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3390 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: take the pixel difference of the two 8x8 blocks,
 * forward-DCT it (the s->dsp.fdct call is elided in this view) and sum
 * the absolute transform coefficients via sum_abs_dctelem. temp is a
 * 16-byte-aligned DCTELEM[64] viewed through a uint64_t array. */
3395 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3396 MpegEncContext * const s= (MpegEncContext *)c;
3397 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3398 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3402 s->dsp.diff_pixels(temp, src1, src2, stride);
3404 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point transform used by dct264_sad8x8_c, expressed
 * via the SRC()/DST() macros defined at each call site. Even half (a0..a3)
 * comes from the symmetric sums s07/s16/s25/s34; odd half (a4..a7) from
 * the antisymmetric differences with the characteristic x + (x>>1)
 * scalings of the H.264-style integer 8x8 transform.
 * NOTE(review): fragment — the enclosing #define line and DST(0)/DST(4)
 * outputs are elided in this view; details should be checked against the
 * full macro before relying on them. */
3409 const int s07 = SRC(0) + SRC(7);\
3410 const int s16 = SRC(1) + SRC(6);\
3411 const int s25 = SRC(2) + SRC(5);\
3412 const int s34 = SRC(3) + SRC(4);\
3413 const int a0 = s07 + s34;\
3414 const int a1 = s16 + s25;\
3415 const int a2 = s07 - s34;\
3416 const int a3 = s16 - s25;\
3417 const int d07 = SRC(0) - SRC(7);\
3418 const int d16 = SRC(1) - SRC(6);\
3419 const int d25 = SRC(2) - SRC(5);\
3420 const int d34 = SRC(3) - SRC(4);\
3421 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3422 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3423 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3424 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3426 DST(1, a4 + (a7>>2)) ;\
3427 DST(2, a2 + (a3>>1)) ;\
3428 DST(3, a5 + (a6>>2)) ;\
3430 DST(5, a6 - (a5>>2)) ;\
3431 DST(6, (a2>>1) - a3 ) ;\
3432 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: difference the blocks, run the 8-point transform
 * (DCT8_1D, invoked via the elided lines) first along rows (SRC/DST map
 * to dct[i][x]) then along columns, where DST accumulates |coefficient|
 * into sum instead of storing. Return of sum is elided in this view. */
3435 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3436 MpegEncContext * const s= (MpegEncContext *)c;
3441 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3443 #define SRC(x) dct[i][x]
3444 #define DST(x,v) dct[i][x]= v
3445 for( i = 0; i < 8; i++ )
3450 #define SRC(x) dct[x][i]
3451 #define DST(x,v) sum += FFABS(v)
3452 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-DCT the block difference (fdct call elided in
 * this view) and return the largest absolute coefficient — a proxy for
 * worst-case quantization visibility rather than total energy. */
3460 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3461 MpegEncContext * const s= (MpegEncContext *)c;
3462 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3463 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3468 s->dsp.diff_pixels(temp, src1, src2, stride);
3472 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the block difference, keep a copy (bak),
 * quantize + dequantize + IDCT it, and return (via the elided return)
 * the squared error between the round-tripped coefficients and the
 * originals — i.e. the distortion the codec's own quantizer would add. */
3477 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3478 MpegEncContext * const s= (MpegEncContext *)c;
3479 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3480 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3481 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3487 s->dsp.diff_pixels(temp, src1, src2, stride);
3489 memcpy(bak, temp, 64*sizeof(DCTELEM));
3491 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3492 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3493 simple_idct(temp); //FIXME
3496 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for an 8x8 block: quantize the DCT of the
 * difference, count the VLC bits needed to code the run/level pairs
 * (escape-coding levels outside the |level|<128 table range via
 * esc_length), dequantize + IDCT back onto a saved copy of src2, and
 * combine the SSE distortion with a lambda-scaled bit cost.
 * NOTE(review): fragment — several loop headers, the run/level update
 * logic and intra/inter branch structure are elided in this view.
 * NOTE(review): 'distoration' is a long-standing typo for 'distortion'
 * (local variable only, so renaming is purely cosmetic). */
3501 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3502 MpegEncContext * const s= (MpegEncContext *)c;
3503 const uint8_t *scantable= s->intra_scantable.permutated;
3504 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3505 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3506 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3507 uint8_t * const bak= (uint8_t*)aligned_bak;
3508 int i, last, run, bits, level, distoration, start_i;
3509 const int esc_length= s->ac_esc_length;
3511 uint8_t * last_length;
3516 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3517 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3520 s->dsp.diff_pixels(temp, src1, src2, stride);
3522 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3528 length = s->intra_ac_vlc_length;
3529 last_length= s->intra_ac_vlc_last_length;
3530 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3533 length = s->inter_ac_vlc_length;
3534 last_length= s->inter_ac_vlc_last_length;
3539 for(i=start_i; i<last; i++){
3540 int j= scantable[i];
3545 if((level&(~127)) == 0){
3546 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3555 level= temp[i] + 64;
3559 if((level&(~127)) == 0){
3560 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3568 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3570 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3573 s->dsp.idct_add(bak, stride, temp);
3575 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3577 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: identical VLC bit counting to rd8x8_c (quantized DCT
 * of the block difference, run/level table lookups with escape fallback)
 * but without reconstructing or measuring distortion — returns just the
 * estimated bit cost (return elided in this view).
 * NOTE(review): fragment — loop bodies and the intra/inter branch
 * structure are elided here; see rd8x8_c above for the full pattern. */
3580 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3581 MpegEncContext * const s= (MpegEncContext *)c;
3582 const uint8_t *scantable= s->intra_scantable.permutated;
3583 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3584 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3585 int i, last, run, bits, level, start_i;
3586 const int esc_length= s->ac_esc_length;
3588 uint8_t * last_length;
3592 s->dsp.diff_pixels(temp, src1, src2, stride);
3594 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3600 length = s->intra_ac_vlc_length;
3601 last_length= s->intra_ac_vlc_last_length;
3602 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3605 length = s->inter_ac_vlc_length;
3606 last_length= s->inter_ac_vlc_last_length;
3611 for(i=start_i; i<last; i++){
3612 int j= scantable[i];
3617 if((level&(~127)) == 0){
3618 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3627 level= temp[i] + 64;
3631 if((level&(~127)) == 0){
3632 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD of a single 16-wide block against its own next row —
 * measures vertical activity/texture (dummy is unused). Row loop and
 * return are elided in this view. */
3640 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3645 for(x=0; x<16; x+=4){
3646 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3647 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD of the residual (s1-s2): sums |Δrow of the difference|,
 * i.e. how much the prediction error changes between rows. Row loop and
 * return are elided in this view. */
3655 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3660 for(x=0; x<16; x++){
3661 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Squared-error variants of the vsad functions above; SQ(a) = a*a.
 * vsse_intra16_c: vertical SSE of a block against its own next row.
 * Row loop and return are elided in this view. */
3670 #define SQ(a) ((a)*(a))
3671 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3676 for(x=0; x<16; x+=4){
3677 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3678 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical SSE of the residual (s1-s2) — squared counterpart of vsad16_c.
 * Row loop and return are elided in this view. */
3686 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3691 for(x=0; x<16; x++){
3692 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array of
 * the same length (signature split across the elided second line; return
 * of score also elided in this view). */
3701 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3705 for(i=0; i<size; i++)
3706 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Generate 16x16 comparison functions from the 8x8 kernels above: the
 * WARPER8_16_SQ macro (sic — historical misspelling of "wrapper") calls
 * the 8x8 function on each of the four quadrants and sums the results.
 * The #ifdef guarding dct264_sad is elided in this view. */
3710 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3711 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3712 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3714 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3716 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3717 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3718 WARPER8_16_SQ(rd8x8_c, rd16_c)
3719 WARPER8_16_SQ(bit8x8_c, bit16_c)
/* Elementwise float multiply-accumulate into dst (the loop body,
 * presumably dst[i] *= src[i], is elided in this view). */
3721 static void vector_fmul_c(float *dst, const float *src, int len){
3723 for(i=0; i<len; i++)
/* dst[i] = src0[i] * src1 read backwards; the negative index works
 * because an elided line first advances src1 to its last element
 * (src1 += len-1 in the original — TODO confirm, that line is not
 * visible in this view). */
3727 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3730 for(i=0; i<len; i++)
3731 dst[i] = src0[i] * src1[-i];
/* Strided fused multiply-add: dst[i*step] = src0[i]*src1[i] + src2[i] +
 * src3, with src3 a scalar int bias added to every element. */
3734 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3736 for(i=0; i<len; i++)
3737 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Convert floats to saturated signed 16-bit samples by manipulating the
 * IEEE-754 bit pattern directly (0x43c0ffff is the bit image of the
 * float just above the representable clipping threshold).
 * NOTE(review): the ((int32_t*)src)[i] cast type-puns a float through an
 * int pointer — a strict-aliasing violation under C99 §6.5; a memcpy or
 * union would be the conforming form. Clamping lines between the
 * comparison and the final store are elided in this view. */
3740 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3742 for(i=0; i<len; i++) {
3743 int_fast32_t tmp = ((int32_t*)src)[i];
3745 tmp = (0x43c0ffff - tmp)>>31;
3746 // is this faster on some gcc/cpu combinations?
3747 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3750 dst[i] = tmp - 0x8000;
3754 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Adapters wrapping the reference (jpeg) IDCT family — full 8x8 and the
 * 4/2/1-point lowres variants — behind the common idct_put / idct_add
 * interface: run the inverse transform (the j_rev_dct* calls are elided
 * in this view) then store or accumulate the clamped pixels. The 1-point
 * versions need no transform: they just scale/round DC ((x+4)>>3) and
 * clamp through ff_cropTbl. */
3756 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3759 put_pixels_clamped_c(block, dest, line_size);
3761 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3764 add_pixels_clamped_c(block, dest, line_size);
3767 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3770 put_pixels_clamped4_c(block, dest, line_size);
3772 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3775 add_pixels_clamped4_c(block, dest, line_size);
3778 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3781 put_pixels_clamped2_c(block, dest, line_size);
3783 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3786 add_pixels_clamped2_c(block, dest, line_size);
3789 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3791 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3793 dest[0] = cm[(block[0] + 4)>>3];
3795 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3797 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3799 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3802 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3804 /* init static data */
/* One-time initialization of shared lookup tables: ff_cropTbl clamps
 * values to 0..255 (identity in the middle, saturated at both ends —
 * the low-side 0 fill line is elided in this view), ff_squareTbl maps
 * i to (i-256)^2, and inv_zigzag_direct16 is the inverse of
 * ff_zigzag_direct offset by 1 so 0 can mean "unused". */
3805 void dsputil_static_init(void)
3809 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3810 for(i=0;i<MAX_NEG_CROP;i++) {
3812 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3815 for(i=0;i<512;i++) {
3816 ff_squareTbl[i] = (i - 256) * (i - 256);
3819 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honors 16-byte stack alignment
 * (required by SSE/AltiVec code): take the address of a DECLARE_ALIGNED_16
 * local and test its low 4 bits. On failure it warns once (did_fail
 * guards repetition); the failure return path and the success return are
 * elided in this view. */
3822 int ff_check_alignment(void){
3823 static int did_fail=0;
3824 DECLARE_ALIGNED_16(int, aligned);
3826 if((long)&aligned & 15){
3828 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3829 av_log(NULL, AV_LOG_ERROR,
3830 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3831 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3832 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3841 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3845 ff_check_alignment();
3847 #ifdef CONFIG_ENCODERS
3848 if(avctx->dct_algo==FF_DCT_FASTINT) {
3849 c->fdct = fdct_ifast;
3850 c->fdct248 = fdct_ifast248;
3852 else if(avctx->dct_algo==FF_DCT_FAAN) {
3853 c->fdct = ff_faandct;
3854 c->fdct248 = ff_faandct248;
3857 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3858 c->fdct248 = ff_fdct248_islow;
3860 #endif //CONFIG_ENCODERS
3862 if(avctx->lowres==1){
3863 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3864 c->idct_put= ff_jref_idct4_put;
3865 c->idct_add= ff_jref_idct4_add;
3867 c->idct_put= ff_h264_lowres_idct_put_c;
3868 c->idct_add= ff_h264_lowres_idct_add_c;
3870 c->idct = j_rev_dct4;
3871 c->idct_permutation_type= FF_NO_IDCT_PERM;
3872 }else if(avctx->lowres==2){
3873 c->idct_put= ff_jref_idct2_put;
3874 c->idct_add= ff_jref_idct2_add;
3875 c->idct = j_rev_dct2;
3876 c->idct_permutation_type= FF_NO_IDCT_PERM;
3877 }else if(avctx->lowres==3){
3878 c->idct_put= ff_jref_idct1_put;
3879 c->idct_add= ff_jref_idct1_add;
3880 c->idct = j_rev_dct1;
3881 c->idct_permutation_type= FF_NO_IDCT_PERM;
3883 if(avctx->idct_algo==FF_IDCT_INT){
3884 c->idct_put= ff_jref_idct_put;
3885 c->idct_add= ff_jref_idct_add;
3886 c->idct = j_rev_dct;
3887 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3888 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
3889 avctx->idct_algo==FF_IDCT_VP3){
3890 c->idct_put= ff_vp3_idct_put_c;
3891 c->idct_add= ff_vp3_idct_add_c;
3892 c->idct = ff_vp3_idct_c;
3893 c->idct_permutation_type= FF_NO_IDCT_PERM;
3894 }else{ //accurate/default
3895 c->idct_put= simple_idct_put;
3896 c->idct_add= simple_idct_add;
3897 c->idct = simple_idct;
3898 c->idct_permutation_type= FF_NO_IDCT_PERM;
3902 if (ENABLE_H264_DECODER) {
3903 c->h264_idct_add= ff_h264_idct_add_c;
3904 c->h264_idct8_add= ff_h264_idct8_add_c;
3905 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3906 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3909 c->get_pixels = get_pixels_c;
3910 c->diff_pixels = diff_pixels_c;
3911 c->put_pixels_clamped = put_pixels_clamped_c;
3912 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3913 c->add_pixels_clamped = add_pixels_clamped_c;
3914 c->add_pixels8 = add_pixels8_c;
3915 c->add_pixels4 = add_pixels4_c;
3916 c->sum_abs_dctelem = sum_abs_dctelem_c;
3919 c->clear_blocks = clear_blocks_c;
3920 c->pix_sum = pix_sum_c;
3921 c->pix_norm1 = pix_norm1_c;
3923 /* TODO [0] 16 [1] 8 */
3924 c->pix_abs[0][0] = pix_abs16_c;
3925 c->pix_abs[0][1] = pix_abs16_x2_c;
3926 c->pix_abs[0][2] = pix_abs16_y2_c;
3927 c->pix_abs[0][3] = pix_abs16_xy2_c;
3928 c->pix_abs[1][0] = pix_abs8_c;
3929 c->pix_abs[1][1] = pix_abs8_x2_c;
3930 c->pix_abs[1][2] = pix_abs8_y2_c;
3931 c->pix_abs[1][3] = pix_abs8_xy2_c;
3933 #define dspfunc(PFX, IDX, NUM) \
3934 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3935 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3936 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3937 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3939 dspfunc(put, 0, 16);
3940 dspfunc(put_no_rnd, 0, 16);
3942 dspfunc(put_no_rnd, 1, 8);
3946 dspfunc(avg, 0, 16);
3947 dspfunc(avg_no_rnd, 0, 16);
3949 dspfunc(avg_no_rnd, 1, 8);
3954 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3955 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3957 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3958 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3959 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3960 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3961 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3962 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3963 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3964 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3965 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3967 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3968 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3969 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3970 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3971 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3972 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3973 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3974 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3975 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3977 #define dspfunc(PFX, IDX, NUM) \
3978 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3979 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3980 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3981 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3982 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3983 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3984 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3985 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3986 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3987 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3988 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3989 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3990 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3991 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3992 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3993 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3995 dspfunc(put_qpel, 0, 16);
3996 dspfunc(put_no_rnd_qpel, 0, 16);
3998 dspfunc(avg_qpel, 0, 16);
3999 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4001 dspfunc(put_qpel, 1, 8);
4002 dspfunc(put_no_rnd_qpel, 1, 8);
4004 dspfunc(avg_qpel, 1, 8);
4005 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4007 dspfunc(put_h264_qpel, 0, 16);
4008 dspfunc(put_h264_qpel, 1, 8);
4009 dspfunc(put_h264_qpel, 2, 4);
4010 dspfunc(put_h264_qpel, 3, 2);
4011 dspfunc(avg_h264_qpel, 0, 16);
4012 dspfunc(avg_h264_qpel, 1, 8);
4013 dspfunc(avg_h264_qpel, 2, 4);
4016 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4017 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4018 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4019 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4020 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4021 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4022 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4024 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4025 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4026 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4027 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4028 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4029 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4030 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4031 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4032 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4033 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4034 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4035 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4036 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4037 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4038 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4039 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4040 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4041 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4042 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4043 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4045 #ifdef CONFIG_CAVS_DECODER
4046 ff_cavsdsp_init(c,avctx);
4048 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4049 ff_vc1dsp_init(c,avctx);
4051 #if defined(CONFIG_H264_ENCODER)
4052 ff_h264dsp_init(c,avctx);
4055 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4056 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4057 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4058 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4059 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4060 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4061 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4062 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4064 #define SET_CMP_FUNC(name) \
4065 c->name[0]= name ## 16_c;\
4066 c->name[1]= name ## 8x8_c;
4068 SET_CMP_FUNC(hadamard8_diff)
4069 c->hadamard8_diff[4]= hadamard8_intra16_c;
4070 SET_CMP_FUNC(dct_sad)
4071 SET_CMP_FUNC(dct_max)
4073 SET_CMP_FUNC(dct264_sad)
4075 c->sad[0]= pix_abs16_c;
4076 c->sad[1]= pix_abs8_c;
4080 SET_CMP_FUNC(quant_psnr)
4083 c->vsad[0]= vsad16_c;
4084 c->vsad[4]= vsad_intra16_c;
4085 c->vsse[0]= vsse16_c;
4086 c->vsse[4]= vsse_intra16_c;
4087 c->nsse[0]= nsse16_c;
4088 c->nsse[1]= nsse8_c;
4089 #ifdef CONFIG_SNOW_ENCODER
4090 c->w53[0]= w53_16_c;
4092 c->w97[0]= w97_16_c;
4096 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4098 c->add_bytes= add_bytes_c;
4099 c->diff_bytes= diff_bytes_c;
4100 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4101 c->bswap_buf= bswap_buf;
4103 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4104 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4105 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4106 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4107 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4108 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4109 c->h264_loop_filter_strength= NULL;
4111 if (ENABLE_ANY_H263) {
4112 c->h263_h_loop_filter= h263_h_loop_filter_c;
4113 c->h263_v_loop_filter= h263_v_loop_filter_c;
4116 c->h261_loop_filter= h261_loop_filter_c;
4118 c->try_8x8basis= try_8x8basis_c;
4119 c->add_8x8basis= add_8x8basis_c;
4121 #ifdef CONFIG_SNOW_DECODER
4122 c->vertical_compose97i = ff_snow_vertical_compose97i;
4123 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4124 c->inner_add_yblock = ff_snow_inner_add_yblock;
4127 #ifdef CONFIG_VORBIS_DECODER
4128 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4130 c->vector_fmul = vector_fmul_c;
4131 c->vector_fmul_reverse = vector_fmul_reverse_c;
4132 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4133 c->float_to_int16 = ff_float_to_int16_c;
4135 c->shrink[0]= ff_img_copy_plane;
4136 c->shrink[1]= ff_shrink22;
4137 c->shrink[2]= ff_shrink44;
4138 c->shrink[3]= ff_shrink88;
4140 c->prefetch= just_return;
4142 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4143 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4145 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4146 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4147 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4148 if (ENABLE_SPARC) dsputil_init_vis (c, avctx);
4149 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4150 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4151 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4152 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4153 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4155 for(i=0; i<64; i++){
4156 if(!c->put_2tap_qpel_pixels_tab[0][i])
4157 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4158 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4159 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4162 switch(c->idct_permutation_type){
4163 case FF_NO_IDCT_PERM:
4165 c->idct_permutation[i]= i;
4167 case FF_LIBMPEG2_IDCT_PERM:
4169 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4171 case FF_SIMPLE_IDCT_PERM:
4173 c->idct_permutation[i]= simple_mmx_permutation[i];
4175 case FF_TRANSPOSE_IDCT_PERM:
4177 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4179 case FF_PARTTRANS_IDCT_PERM:
4181 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4184 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");