git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file libavcodec/dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "snow.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39
  40 /* snow.c */
  41 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  42
  43 /* vorbis.c */
  44 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  45
  46 /* ac3dec.c */
  47 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
  48
  49 /* lpc.c */
  50 void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  51
  52 /* pngdec.c */
  53 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  54
  55 /* eaidct.c */
  56 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  57
  58 /* binkidct.c */
  59 void ff_bink_idct_c    (DCTELEM *block);
  60 void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
  61 void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  62
/* Clamping lookup table. The pixel writers in this file index it through
 * (ff_cropTbl + MAX_NEG_CROP), so indices from -MAX_NEG_CROP up to
 * 255 + MAX_NEG_CROP stay inside the array. Only zeroed storage is defined
 * here; the contents are presumably filled in by init code that is not part
 * of this section. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table read through (ff_squareTbl + 256) by the sse*_c and
 * pix_norm1_c functions, indexed with pixel values or pixel differences
 * (-255..255). Presumably holds the square of (index - 256); the
 * initialisation is not visible in this section. */
uint32_t ff_squareTbl[512] = {0, };
  65
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is the byte 0x01 repeated across an unsigned long, so multiplying
// it by a byte value replicates that value into every byte of the word.)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  69
/* Zigzag scan order for an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  80
/* Specific zigzag scan for the 2-4-8 IDCT. NOTE: unlike the
   specification, the two fields are interleaved here. Entries are raster
   positions (row * 8 + column) inside an 8x8 block. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  93
/* Inverse of zigzag_direct with 1 added to each entry and no IDCT
 * permutation applied; used by the MMX quantizer. Only the 16-byte aligned
 * storage is defined here; the code that fills it is not in this section. */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64];
  96
/* Alternate scan order that walks mostly along rows first. Entry i is the
 * raster position (row * 8 + column) of the i-th coefficient of an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
 107
/* Alternate scan order that walks mostly down columns first. Entry i is the
 * raster position (row * 8 + column) of the i-th coefficient of an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 118
/* Fixed-point reciprocals: for b >= 2, ff_inverse[b] is 2^32 / b rounded up,
 * so a division by b can be replaced by a 64-bit multiply and a shift.
 * Entry 0 is 0 and entry 1 is 2^32 - 1 (2^32 itself does not fit in 32 bits).
 *
 * a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
 156
/* Input permutation for the simple_idct_mmx: a 64-entry mapping between
 * coefficient positions of an 8x8 block. It is not referenced in this part
 * of the file; presumably the DSP init code uses it to build the IDCT
 * permutation -- confirm against the caller. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 168
/* Eight-entry permutation that interleaves indices 0..3 with 4..7. Going by
 * its name it is the per-row permutation for the SSE2 IDCT; it is not
 * referenced in this part of the file. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 170
 171 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 172     int i;
 173     int end;
 174
 175     st->scantable= src_scantable;
 176
 177     for(i=0; i<64; i++){
 178         int j;
 179         j = src_scantable[i];
 180         st->permutated[i] = permutation[j];
 181 #if ARCH_PPC
 182         st->inverse[j] = i;
 183 #endif
 184     }
 185
 186     end=-1;
 187     for(i=0; i<64; i++){
 188         int j;
 189         j = st->permutated[i];
 190         if(j>end) end=j;
 191         st->raster_end[i]= end;
 192     }
 193 }
 194
 195 static int pix_sum_c(uint8_t * pix, int line_size)
 196 {
 197     int s, i, j;
 198
 199     s = 0;
 200     for (i = 0; i < 16; i++) {
 201         for (j = 0; j < 16; j += 8) {
 202             s += pix[0];
 203             s += pix[1];
 204             s += pix[2];
 205             s += pix[3];
 206             s += pix[4];
 207             s += pix[5];
 208             s += pix[6];
 209             s += pix[7];
 210             pix += 8;
 211         }
 212         pix += line_size - 16;
 213     }
 214     return s;
 215 }
 216
 217 static int pix_norm1_c(uint8_t * pix, int line_size)
 218 {
 219     int s, i, j;
 220     uint32_t *sq = ff_squareTbl + 256;
 221
 222     s = 0;
 223     for (i = 0; i < 16; i++) {
 224         for (j = 0; j < 16; j += 8) {
 225 #if 0
 226             s += sq[pix[0]];
 227             s += sq[pix[1]];
 228             s += sq[pix[2]];
 229             s += sq[pix[3]];
 230             s += sq[pix[4]];
 231             s += sq[pix[5]];
 232             s += sq[pix[6]];
 233             s += sq[pix[7]];
 234 #else
 235 #if LONG_MAX > 2147483647
 236             register uint64_t x=*(uint64_t*)pix;
 237             s += sq[x&0xff];
 238             s += sq[(x>>8)&0xff];
 239             s += sq[(x>>16)&0xff];
 240             s += sq[(x>>24)&0xff];
 241             s += sq[(x>>32)&0xff];
 242             s += sq[(x>>40)&0xff];
 243             s += sq[(x>>48)&0xff];
 244             s += sq[(x>>56)&0xff];
 245 #else
 246             register uint32_t x=*(uint32_t*)pix;
 247             s += sq[x&0xff];
 248             s += sq[(x>>8)&0xff];
 249             s += sq[(x>>16)&0xff];
 250             s += sq[(x>>24)&0xff];
 251             x=*(uint32_t*)(pix+4);
 252             s += sq[x&0xff];
 253             s += sq[(x>>8)&0xff];
 254             s += sq[(x>>16)&0xff];
 255             s += sq[(x>>24)&0xff];
 256 #endif
 257 #endif
 258             pix += 8;
 259         }
 260         pix += line_size - 16;
 261     }
 262     return s;
 263 }
 264
 265 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 266     int i;
 267
 268     for(i=0; i+8<=w; i+=8){
 269         dst[i+0]= bswap_32(src[i+0]);
 270         dst[i+1]= bswap_32(src[i+1]);
 271         dst[i+2]= bswap_32(src[i+2]);
 272         dst[i+3]= bswap_32(src[i+3]);
 273         dst[i+4]= bswap_32(src[i+4]);
 274         dst[i+5]= bswap_32(src[i+5]);
 275         dst[i+6]= bswap_32(src[i+6]);
 276         dst[i+7]= bswap_32(src[i+7]);
 277     }
 278     for(;i<w; i++){
 279         dst[i+0]= bswap_32(src[i+0]);
 280     }
 281 }
 282
 283 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 284 {
 285     int s, i;
 286     uint32_t *sq = ff_squareTbl + 256;
 287
 288     s = 0;
 289     for (i = 0; i < h; i++) {
 290         s += sq[pix1[0] - pix2[0]];
 291         s += sq[pix1[1] - pix2[1]];
 292         s += sq[pix1[2] - pix2[2]];
 293         s += sq[pix1[3] - pix2[3]];
 294         pix1 += line_size;
 295         pix2 += line_size;
 296     }
 297     return s;
 298 }
 299
 300 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 301 {
 302     int s, i;
 303     uint32_t *sq = ff_squareTbl + 256;
 304
 305     s = 0;
 306     for (i = 0; i < h; i++) {
 307         s += sq[pix1[0] - pix2[0]];
 308         s += sq[pix1[1] - pix2[1]];
 309         s += sq[pix1[2] - pix2[2]];
 310         s += sq[pix1[3] - pix2[3]];
 311         s += sq[pix1[4] - pix2[4]];
 312         s += sq[pix1[5] - pix2[5]];
 313         s += sq[pix1[6] - pix2[6]];
 314         s += sq[pix1[7] - pix2[7]];
 315         pix1 += line_size;
 316         pix2 += line_size;
 317     }
 318     return s;
 319 }
 320
 321 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 322 {
 323     int s, i;
 324     uint32_t *sq = ff_squareTbl + 256;
 325
 326     s = 0;
 327     for (i = 0; i < h; i++) {
 328         s += sq[pix1[ 0] - pix2[ 0]];
 329         s += sq[pix1[ 1] - pix2[ 1]];
 330         s += sq[pix1[ 2] - pix2[ 2]];
 331         s += sq[pix1[ 3] - pix2[ 3]];
 332         s += sq[pix1[ 4] - pix2[ 4]];
 333         s += sq[pix1[ 5] - pix2[ 5]];
 334         s += sq[pix1[ 6] - pix2[ 6]];
 335         s += sq[pix1[ 7] - pix2[ 7]];
 336         s += sq[pix1[ 8] - pix2[ 8]];
 337         s += sq[pix1[ 9] - pix2[ 9]];
 338         s += sq[pix1[10] - pix2[10]];
 339         s += sq[pix1[11] - pix2[11]];
 340         s += sq[pix1[12] - pix2[12]];
 341         s += sq[pix1[13] - pix2[13]];
 342         s += sq[pix1[14] - pix2[14]];
 343         s += sq[pix1[15] - pix2[15]];
 344
 345         pix1 += line_size;
 346         pix2 += line_size;
 347     }
 348     return s;
 349 }
 350
 351
 352 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
 353 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 354     int s, i, j;
 355     const int dec_count= w==8 ? 3 : 4;
 356     int tmp[32*32];
 357     int level, ori;
 358     static const int scale[2][2][4][4]={
 359       {
 360         {
 361             // 9/7 8x8 dec=3
 362             {268, 239, 239, 213},
 363             {  0, 224, 224, 152},
 364             {  0, 135, 135, 110},
 365         },{
 366             // 9/7 16x16 or 32x32 dec=4
 367             {344, 310, 310, 280},
 368             {  0, 320, 320, 228},
 369             {  0, 175, 175, 136},
 370             {  0, 129, 129, 102},
 371         }
 372       },{
 373         {
 374             // 5/3 8x8 dec=3
 375             {275, 245, 245, 218},
 376             {  0, 230, 230, 156},
 377             {  0, 138, 138, 113},
 378         },{
 379             // 5/3 16x16 or 32x32 dec=4
 380             {352, 317, 317, 286},
 381             {  0, 328, 328, 233},
 382             {  0, 180, 180, 140},
 383             {  0, 132, 132, 105},
 384         }
 385       }
 386     };
 387
 388     for (i = 0; i < h; i++) {
 389         for (j = 0; j < w; j+=4) {
 390             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 391             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 392             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 393             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 394         }
 395         pix1 += line_size;
 396         pix2 += line_size;
 397     }
 398
 399     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 400
 401     s=0;
 402     assert(w==h);
 403     for(level=0; level<dec_count; level++){
 404         for(ori= level ? 1 : 0; ori<4; ori++){
 405             int size= w>>(dec_count-level);
 406             int sx= (ori&1) ? size : 0;
 407             int stride= 32<<(dec_count-level);
 408             int sy= (ori&2) ? stride>>1 : 0;
 409
 410             for(i=0; i<size; i++){
 411                 for(j=0; j<size; j++){
 412                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 413                     s += FFABS(v);
 414                 }
 415             }
 416         }
 417     }
 418     assert(s>=0);
 419     return s>>9;
 420 }
 421
 422 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 423     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 424 }
 425
 426 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 427     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 428 }
 429
 430 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 431     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 432 }
 433
 434 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 435     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 436 }
 437
 438 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 439     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 440 }
 441
 442 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 443     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 444 }
 445 #endif
 446
 447 /* draw the edges of width 'w' of an image of size width, height */
 448 //FIXME check that this is ok for mpeg4 interlaced
 449 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 450 {
 451     uint8_t *ptr, *last_line;
 452     int i;
 453
 454     last_line = buf + (height - 1) * wrap;
 455     for(i=0;i<w;i++) {
 456         /* top and bottom */
 457         memcpy(buf - (i + 1) * wrap, buf, width);
 458         memcpy(last_line + (i + 1) * wrap, last_line, width);
 459     }
 460     /* left and right */
 461     ptr = buf;
 462     for(i=0;i<height;i++) {
 463         memset(ptr - w, ptr[0], w);
 464         memset(ptr + width, ptr[width-1], w);
 465         ptr += wrap;
 466     }
 467     /* corners */
 468     for(i=0;i<w;i++) {
 469         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 470         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 471         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 472         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 473     }
 474 }
 475
 476 /**
 477  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 478  * @param buf destination buffer
 479  * @param src source buffer
 480  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 481  * @param block_w width of block
 482  * @param block_h height of block
 483  * @param src_x x coordinate of the top left sample of the block in the source buffer
 484  * @param src_y y coordinate of the top left sample of the block in the source buffer
 485  * @param w width of the source buffer
 486  * @param h height of the source buffer
 487  */
 488 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 489                                     int src_x, int src_y, int w, int h){
 490     int x, y;
 491     int start_y, start_x, end_y, end_x;
 492
 493     if(src_y>= h){
 494         src+= (h-1-src_y)*linesize;
 495         src_y=h-1;
 496     }else if(src_y<=-block_h){
 497         src+= (1-block_h-src_y)*linesize;
 498         src_y=1-block_h;
 499     }
 500     if(src_x>= w){
 501         src+= (w-1-src_x);
 502         src_x=w-1;
 503     }else if(src_x<=-block_w){
 504         src+= (1-block_w-src_x);
 505         src_x=1-block_w;
 506     }
 507
 508     start_y= FFMAX(0, -src_y);
 509     start_x= FFMAX(0, -src_x);
 510     end_y= FFMIN(block_h, h-src_y);
 511     end_x= FFMIN(block_w, w-src_x);
 512
 513     // copy existing part
 514     for(y=start_y; y<end_y; y++){
 515         for(x=start_x; x<end_x; x++){
 516             buf[x + y*linesize]= src[x + y*linesize];
 517         }
 518     }
 519
 520     //top
 521     for(y=0; y<start_y; y++){
 522         for(x=start_x; x<end_x; x++){
 523             buf[x + y*linesize]= buf[x + start_y*linesize];
 524         }
 525     }
 526
 527     //bottom
 528     for(y=end_y; y<block_h; y++){
 529         for(x=start_x; x<end_x; x++){
 530             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 531         }
 532     }
 533
 534     for(y=0; y<block_h; y++){
 535        //left
 536         for(x=0; x<start_x; x++){
 537             buf[x + y*linesize]= buf[start_x + y*linesize];
 538         }
 539
 540        //right
 541         for(x=end_x; x<block_w; x++){
 542             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 543         }
 544     }
 545 }
 546
 547 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 548 {
 549     int i;
 550
 551     /* read the pixels */
 552     for(i=0;i<8;i++) {
 553         block[0] = pixels[0];
 554         block[1] = pixels[1];
 555         block[2] = pixels[2];
 556         block[3] = pixels[3];
 557         block[4] = pixels[4];
 558         block[5] = pixels[5];
 559         block[6] = pixels[6];
 560         block[7] = pixels[7];
 561         pixels += line_size;
 562         block += 8;
 563     }
 564 }
 565
 566 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 567                           const uint8_t *s2, int stride){
 568     int i;
 569
 570     /* read the pixels */
 571     for(i=0;i<8;i++) {
 572         block[0] = s1[0] - s2[0];
 573         block[1] = s1[1] - s2[1];
 574         block[2] = s1[2] - s2[2];
 575         block[3] = s1[3] - s2[3];
 576         block[4] = s1[4] - s2[4];
 577         block[5] = s1[5] - s2[5];
 578         block[6] = s1[6] - s2[6];
 579         block[7] = s1[7] - s2[7];
 580         s1 += stride;
 581         s2 += stride;
 582         block += 8;
 583     }
 584 }
 585
 586
 587 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 588                                  int line_size)
 589 {
 590     int i;
 591     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 592
 593     /* read the pixels */
 594     for(i=0;i<8;i++) {
 595         pixels[0] = cm[block[0]];
 596         pixels[1] = cm[block[1]];
 597         pixels[2] = cm[block[2]];
 598         pixels[3] = cm[block[3]];
 599         pixels[4] = cm[block[4]];
 600         pixels[5] = cm[block[5]];
 601         pixels[6] = cm[block[6]];
 602         pixels[7] = cm[block[7]];
 603
 604         pixels += line_size;
 605         block += 8;
 606     }
 607 }
 608
 609 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 610                                  int line_size)
 611 {
 612     int i;
 613     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 614
 615     /* read the pixels */
 616     for(i=0;i<4;i++) {
 617         pixels[0] = cm[block[0]];
 618         pixels[1] = cm[block[1]];
 619         pixels[2] = cm[block[2]];
 620         pixels[3] = cm[block[3]];
 621
 622         pixels += line_size;
 623         block += 8;
 624     }
 625 }
 626
 627 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 628                                  int line_size)
 629 {
 630     int i;
 631     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 632
 633     /* read the pixels */
 634     for(i=0;i<2;i++) {
 635         pixels[0] = cm[block[0]];
 636         pixels[1] = cm[block[1]];
 637
 638         pixels += line_size;
 639         block += 8;
 640     }
 641 }
 642
 643 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 644                                         uint8_t *restrict pixels,
 645                                         int line_size)
 646 {
 647     int i, j;
 648
 649     for (i = 0; i < 8; i++) {
 650         for (j = 0; j < 8; j++) {
 651             if (*block < -128)
 652                 *pixels = 0;
 653             else if (*block > 127)
 654                 *pixels = 255;
 655             else
 656                 *pixels = (uint8_t)(*block + 128);
 657             block++;
 658             pixels++;
 659         }
 660         pixels += (line_size - 8);
 661     }
 662 }
 663
 664 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 665                                     int line_size)
 666 {
 667     int i;
 668
 669     /* read the pixels */
 670     for(i=0;i<8;i++) {
 671         pixels[0] = block[0];
 672         pixels[1] = block[1];
 673         pixels[2] = block[2];
 674         pixels[3] = block[3];
 675         pixels[4] = block[4];
 676         pixels[5] = block[5];
 677         pixels[6] = block[6];
 678         pixels[7] = block[7];
 679
 680         pixels += line_size;
 681         block += 8;
 682     }
 683 }
 684
 685 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 686                           int line_size)
 687 {
 688     int i;
 689     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 690
 691     /* read the pixels */
 692     for(i=0;i<8;i++) {
 693         pixels[0] = cm[pixels[0] + block[0]];
 694         pixels[1] = cm[pixels[1] + block[1]];
 695         pixels[2] = cm[pixels[2] + block[2]];
 696         pixels[3] = cm[pixels[3] + block[3]];
 697         pixels[4] = cm[pixels[4] + block[4]];
 698         pixels[5] = cm[pixels[5] + block[5]];
 699         pixels[6] = cm[pixels[6] + block[6]];
 700         pixels[7] = cm[pixels[7] + block[7]];
 701         pixels += line_size;
 702         block += 8;
 703     }
 704 }
 705
 706 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 707                           int line_size)
 708 {
 709     int i;
 710     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 711
 712     /* read the pixels */
 713     for(i=0;i<4;i++) {
 714         pixels[0] = cm[pixels[0] + block[0]];
 715         pixels[1] = cm[pixels[1] + block[1]];
 716         pixels[2] = cm[pixels[2] + block[2]];
 717         pixels[3] = cm[pixels[3] + block[3]];
 718         pixels += line_size;
 719         block += 8;
 720     }
 721 }
 722
 723 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 724                           int line_size)
 725 {
 726     int i;
 727     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 728
 729     /* read the pixels */
 730     for(i=0;i<2;i++) {
 731         pixels[0] = cm[pixels[0] + block[0]];
 732         pixels[1] = cm[pixels[1] + block[1]];
 733         pixels += line_size;
 734         block += 8;
 735     }
 736 }
 737
 738 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 739 {
 740     int i;
 741     for(i=0;i<8;i++) {
 742         pixels[0] += block[0];
 743         pixels[1] += block[1];
 744         pixels[2] += block[2];
 745         pixels[3] += block[3];
 746         pixels[4] += block[4];
 747         pixels[5] += block[5];
 748         pixels[6] += block[6];
 749         pixels[7] += block[7];
 750         pixels += line_size;
 751         block += 8;
 752     }
 753 }
 754
 755 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 756 {
 757     int i;
 758     for(i=0;i<4;i++) {
 759         pixels[0] += block[0];
 760         pixels[1] += block[1];
 761         pixels[2] += block[2];
 762         pixels[3] += block[3];
 763         pixels += line_size;
 764         block += 4;
 765     }
 766 }
 767
 768 static int sum_abs_dctelem_c(DCTELEM *block)
 769 {
 770     int sum=0, i;
 771     for(i=0; i<64; i++)
 772         sum+= FFABS(block[i]);
 773     return sum;
 774 }
 775
 776 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 777 {
 778     int i;
 779
 780     for (i = 0; i < h; i++) {
 781         memset(block, value, 16);
 782         block += line_size;
 783     }
 784 }
 785
 786 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 787 {
 788     int i;
 789
 790     for (i = 0; i < h; i++) {
 791         memset(block, value, 8);
 792         block += line_size;
 793     }
 794 }
 795
 796 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 797 {
 798     int i, j;
 799     uint16_t *dst1 = dst;
 800     uint16_t *dst2 = dst + linesize;
 801
 802     for (j = 0; j < 8; j++) {
 803         for (i = 0; i < 8; i++) {
 804             dst1[i] = dst2[i] = src[i] * 0x0101;
 805         }
 806         src  += 8;
 807         dst1 += linesize;
 808         dst2 += linesize;
 809     }
 810 }
 811
 812 #if 0
 813
 814 #define PIXOP2(OPNAME, OP) \
 815 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 816 {\
 817     int i;\
 818     for(i=0; i<h; i++){\
 819         OP(*((uint64_t*)block), AV_RN64(pixels));\
 820         pixels+=line_size;\
 821         block +=line_size;\
 822     }\
 823 }\
 824 \
 825 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 826 {\
 827     int i;\
 828     for(i=0; i<h; i++){\
 829         const uint64_t a= AV_RN64(pixels  );\
 830         const uint64_t b= AV_RN64(pixels+1);\
 831         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 832         pixels+=line_size;\
 833         block +=line_size;\
 834     }\
 835 }\
 836 \
 837 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 838 {\
 839     int i;\
 840     for(i=0; i<h; i++){\
 841         const uint64_t a= AV_RN64(pixels  );\
 842         const uint64_t b= AV_RN64(pixels+1);\
 843         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 844         pixels+=line_size;\
 845         block +=line_size;\
 846     }\
 847 }\
 848 \
 849 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 850 {\
 851     int i;\
 852     for(i=0; i<h; i++){\
 853         const uint64_t a= AV_RN64(pixels          );\
 854         const uint64_t b= AV_RN64(pixels+line_size);\
 855         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 856         pixels+=line_size;\
 857         block +=line_size;\
 858     }\
 859 }\
 860 \
 861 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 862 {\
 863     int i;\
 864     for(i=0; i<h; i++){\
 865         const uint64_t a= AV_RN64(pixels          );\
 866         const uint64_t b= AV_RN64(pixels+line_size);\
 867         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 868         pixels+=line_size;\
 869         block +=line_size;\
 870     }\
 871 }\
 872 \
 873 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 874 {\
 875         int i;\
 876         const uint64_t a= AV_RN64(pixels  );\
 877         const uint64_t b= AV_RN64(pixels+1);\
 878         uint64_t l0=  (a&0x0303030303030303ULL)\
 879                     + (b&0x0303030303030303ULL)\
 880                     + 0x0202020202020202ULL;\
 881         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 882                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 883         uint64_t l1,h1;\
 884 \
 885         pixels+=line_size;\
 886         for(i=0; i<h; i+=2){\
 887             uint64_t a= AV_RN64(pixels  );\
 888             uint64_t b= AV_RN64(pixels+1);\
 889             l1=  (a&0x0303030303030303ULL)\
 890                + (b&0x0303030303030303ULL);\
 891             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 892               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 893             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 894             pixels+=line_size;\
 895             block +=line_size;\
 896             a= AV_RN64(pixels  );\
 897             b= AV_RN64(pixels+1);\
 898             l0=  (a&0x0303030303030303ULL)\
 899                + (b&0x0303030303030303ULL)\
 900                + 0x0202020202020202ULL;\
 901             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 902               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 903             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 904             pixels+=line_size;\
 905             block +=line_size;\
 906         }\
 907 }\
 908 \
 909 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 910 {\
 911         int i;\
 912         const uint64_t a= AV_RN64(pixels  );\
 913         const uint64_t b= AV_RN64(pixels+1);\
 914         uint64_t l0=  (a&0x0303030303030303ULL)\
 915                     + (b&0x0303030303030303ULL)\
 916                     + 0x0101010101010101ULL;\
 917         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 918                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 919         uint64_t l1,h1;\
 920 \
 921         pixels+=line_size;\
 922         for(i=0; i<h; i+=2){\
 923             uint64_t a= AV_RN64(pixels  );\
 924             uint64_t b= AV_RN64(pixels+1);\
 925             l1=  (a&0x0303030303030303ULL)\
 926                + (b&0x0303030303030303ULL);\
 927             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 928               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 929             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 930             pixels+=line_size;\
 931             block +=line_size;\
 932             a= AV_RN64(pixels  );\
 933             b= AV_RN64(pixels+1);\
 934             l0=  (a&0x0303030303030303ULL)\
 935                + (b&0x0303030303030303ULL)\
 936                + 0x0101010101010101ULL;\
 937             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 938               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 939             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 940             pixels+=line_size;\
 941             block +=line_size;\
 942         }\
 943 }\
 944 \
 945 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 946 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 947 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 948 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 949 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 950 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 951 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 952
/*
 * OP used for the avg_* functions of the uint64_t-based PIXOP2 above:
 * per byte, (a|b) - ((a^b)>>1) equals (a + b + 1) >> 1, i.e. the average of
 * the old and new value rounded up.  The 0xFE mask clears the low bit of
 * every byte first so the shift cannot leak a bit into the neighbouring byte.
 */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else /* end of the 64 bit variant; the code below works on 16/32 bit words */
 955
/*
 * PIXOP2(OPNAME, OP), variant operating on 16/32 bit words.
 *
 * Expands to a family of block functions whose names start with OPNAME.
 * OP(dst, val) is the store operation applied to each destination word; it
 * is supplied where the macro is instantiated.
 *
 * Generated functions:
 *  - OPNAME_pixels{2,4,8}_c: apply OP to h rows of 2/4/8 bytes.
 *  - OPNAME_no_rnd_pixels8_c: forwards to OPNAME_pixels8_c.
 *  - OPNAME_pixels{2,4,8,16}_l2 and OPNAME_no_rnd_pixels{8,16}_l2: blend two
 *    sources word by word with rnd_avg32() / no_rnd_avg32() (both defined
 *    outside this macro; presumably byte-wise averages with and without
 *    upward rounding -- confirm).
 *  - OPNAME_*_x2_c / OPNAME_*_y2_c: the _l2 blend of pixels with pixels+1
 *    (x2) or with pixels+line_size (y2).
 *  - OPNAME_pixels{8,16}_l4 and the no_rnd versions: blend four sources.
 *  - OPNAME_pixels{2,4,8}_xy2_c and OPNAME_no_rnd_pixels8_xy2_c: average of
 *    each pixel with its right, lower and lower-right neighbour.
 *  - 16 pixel wide versions built with CALL_2X_PIXELS (defined outside this
 *    macro).
 *
 * Packed arithmetic used by the _l4 and _xy2 functions: every byte is split
 * into its low 2 bits (mask 0x03) and its high 6 bits (mask 0xFC, shifted
 * down by 2).  Four low parts plus the bias sum to at most 14 and four high
 * parts to at most 252, so neither sum carries into the neighbouring byte.
 * The bias is 2 per byte for the rounding functions and 1 per byte for the
 * no_rnd ones.
 *
 * NOTE(review): the _xy2 loops write two rows per iteration (i+=2), so h is
 * presumably always even; an odd h would write h+1 rows -- confirm.
 * NOTE(review): block/dst are written through uint16_t* / uint32_t* casts;
 * presumably callers guarantee suitable alignment -- confirm.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
/* Two-source blends: every pointer is advanced with its own stride. */\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* x2: blend pixels with pixels+1; y2: blend pixels with pixels+line_size. */\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Four-source blend, 8 bytes per row, with a per-byte bias of 2 before the divide by 4. */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Same four-source blend as _pixels8_l4, but with a per-byte bias of 1. */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* 2x2 average on plain ints; a0/b0 and a1/b1 hold the horizontal sums of two */\
/* consecutive source rows (a0/b0 include the bias of 2). The result is stored */\
/* directly instead of through OP, see the FIXME below. */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* 2x2 average on packed bytes; l0/h0 and l1/h1 are the low/high partial sums */\
/* of two consecutive source rows and are reused for the following output row. */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* 8 wide 2x2 average done as two passes (j) over 4 byte wide columns. */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* undo the row advances (exact for even h) and step 4 bytes to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* Same as _pixels8_xy2_c, but with a per-byte bias of 1 instead of 2. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* 16 wide versions; note that _no_rnd_pixels16_c is built from _pixels8_c. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1321
/* OP for the avg_* functions of the 32 bit PIXOP2: the destination word is
 * replaced by rnd_avg32() of its old value and the new value (rnd_avg32 is
 * defined outside this file section). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* OP for the put_* functions: plain store of the new value. */
#define op_put(a, b) a = b

/* Instantiate the PIXOP2 function family once per store operation. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* The OP helpers are dropped again right after the two expansions. */
#undef op_avg
#undef op_put
1330
/*
 * avg2: average of two values, rounded up ((a + b + 1) >> 1).
 * avg4: average of four values with a bias of 2 before the divide by 4.
 * Every argument is parenthesized so that expression arguments such as
 * `x|y` or `c ? p : q` are evaluated as a unit before being summed.
 * Arguments are still evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1333
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources are all walked with the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1337
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources are all walked with the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1341
/**
 * Bilinear interpolation of an 8 pixel wide block at a fixed sub-pixel
 * offset given in 1/16 pixel units.
 *
 * Each output pixel is the weighted sum of the source pixel, its right,
 * lower and lower-right neighbour, plus rounder, shifted right by 8.
 * The four weights always add up to 16*16 = 256.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source, 9 bytes read from each of h+1 rows
 * @param stride  distance between rows, used for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder constant added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_cur   = (16 - x16) * (16 - y16);
    const int w_right = x16        * (16 - y16);
    const int w_down  = (16 - x16) * y16;
    const int w_diag  = x16        * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur  * src[col]   + w_right * src[col + 1] +
                        w_down * below[col] + w_diag  * below[col + 1] + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1364
/**
 * Fill an 8 pixel wide, h row block by sampling src at positions that
 * advance linearly from pixel to pixel and from row to row.
 *
 * The running position (vx, vy) starts at (ox, oy) for each row, moves by
 * (dxx, dyx) per output pixel and the row start moves by (dxy, dyy) per row.
 * After dropping the low 16 bits, the lowest `shift` bits of a coordinate
 * are the interpolation fraction and the rest is the integer position.
 *
 * While an integer coordinate lies in [0, width-1) resp. [0, height-1) the
 * sample is interpolated along that axis; otherwise the coordinate is passed
 * through av_clip() with the bounds 0 and width-1 resp. height-1 and no
 * interpolation is done along that axis.
 *
 * @param dst    destination block, rows are stride bytes apart
 * @param src    source picture, rows are stride bytes apart
 * @param r      constant added before the final shift by 2*shift
 * @param width  source width used for the range check
 * @param height source height used for the range check
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s= 1<<shift;
    const int frac_mask= s-1;
    const int last_x= width-1;
    const int last_y= height-1;
    int row;

    for(row=0; row<h; row++, ox+=dxy, oy+=dyy){
        uint8_t *out= dst + row*stride;
        int vx= ox;
        int vy= oy;
        int col;

        for(col=0; col<8; col++, vx+=dxx, vy+=dyx){ //XXX FIXME optimize
            const int px= vx>>16;
            const int py= vy>>16;
            /* fractions are taken before the integer part is shifted out */
            const int fx= px&frac_mask;
            const int fy= py&frac_mask;
            const int ix= px>>shift;
            const int iy= py>>shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside= (unsigned)ix < last_x;
            const int y_inside= (unsigned)iy < last_y;
            int index;

            if(x_inside && y_inside){
                /* full bilinear interpolation */
                index= ix + iy*stride;
                out[col]= (  (  src[index         ]*(s-fx)
                              + src[index       +1]*   fx )*(s-fy)
                           + (  src[index+stride  ]*(s-fx)
                              + src[index+stride+1]*   fx )*   fy
                           + r)>>(shift*2);
            }else if(x_inside){
                /* row clamped: horizontal interpolation only */
                index= ix + av_clip(iy, 0, last_y)*stride;
                out[col]= ( (  src[index         ]*(s-fx)
                             + src[index       +1]*   fx )*s
                           + r)>>(shift*2);
            }else if(y_inside){
                /* column clamped: vertical interpolation only */
                index= av_clip(ix, 0, last_x) + iy*stride;
                out[col]= ( (  src[index         ]*(s-fy)
                             + src[index+stride  ]*   fy )*s
                           + r)>>(shift*2);
            }else{
                /* both clamped: plain copy of the nearest sample */
                index= av_clip(ix, 0, last_x) + av_clip(iy, 0, last_y)*stride;
                out[col]= src[index];
            }
        }
    }
}
1422
/**
 * Unfiltered copy of a width x height block: forwards to the put_pixels
 * routine matching the width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1431
/**
 * Horizontal blend with weights 2:1 (pixel : right neighbour).
 * The biased sum is scaled by 683/2048, which is roughly a divide by 3.
 * Reads width+1 pixels from each of the height rows.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1442
/**
 * Horizontal blend with weights 1:2 (pixel : right neighbour).
 * The biased sum is scaled by 683/2048, which is roughly a divide by 3.
 * Reads width+1 pixels from each of the height rows.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1453
/**
 * Vertical blend with weights 2:1 (pixel : pixel one row below).
 * The biased sum is scaled by 683/2048, which is roughly a divide by 3.
 * Reads height+1 rows of width pixels.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1464
/**
 * 2x2 blend with weights 4 (pixel), 3 (right), 3 (below), 2 (below right).
 * The biased sum is scaled by 2731/32768, which is roughly a divide by 12.
 * Reads width+1 pixels from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1475
/**
 * 2x2 blend with weights 3 (pixel), 2 (right), 4 (below), 3 (below right).
 * The biased sum is scaled by 2731/32768, which is roughly a divide by 12.
 * Reads width+1 pixels from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1486
/**
 * Vertical blend with weights 1:2 (pixel : pixel one row below).
 * The biased sum is scaled by 683/2048, which is roughly a divide by 3.
 * Reads height+1 rows of width pixels.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1497
/**
 * 2x2 blend with weights 3 (pixel), 4 (right), 2 (below), 3 (below right).
 * The biased sum is scaled by 2731/32768, which is roughly a divide by 12.
 * Reads width+1 pixels from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1508
/**
 * 2x2 blend with weights 2 (pixel), 3 (right), 3 (below), 4 (below right).
 * The biased sum is scaled by 2731/32768, which is roughly a divide by 12.
 * Reads width+1 pixels from each of height+1 rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1519
/**
 * Unfiltered averaging of a width x height block into dst: forwards to the
 * avg_pixels routine matching the width. Widths other than 2, 4, 8 and 16
 * do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1528
/**
 * Horizontal 2:1 blend (pixel : right neighbour, scaled by 683/2048),
 * then averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels from each of the height rows.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1539
/**
 * Horizontal 1:2 blend (pixel : right neighbour, scaled by 683/2048),
 * then averaged with the existing dst pixel, rounding up.
 * Reads width+1 pixels from each of the height rows.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1550
/**
 * Vertical 2:1 blend (pixel : pixel one row below, scaled by 683/2048),
 * then averaged with the existing dst pixel, rounding up.
 * Reads height+1 rows of width pixels.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1561
/**
 * 2x2 blend with weights 4 (pixel), 3 (right), 3 (below), 2 (below right),
 * scaled by 2731/32768, then averaged with the existing dst pixel,
 * rounding up. Reads width+1 pixels from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1572
/**
 * 2x2 blend with weights 3 (pixel), 2 (right), 4 (below), 3 (below right),
 * scaled by 2731/32768, then averaged with the existing dst pixel,
 * rounding up. Reads width+1 pixels from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1583
/**
 * Vertical 1:2 blend (pixel : pixel one row below, scaled by 683/2048),
 * then averaged with the existing dst pixel, rounding up.
 * Reads height+1 rows of width pixels.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1594
/**
 * 2x2 blend with weights 3 (pixel), 4 (right), 2 (below), 3 (below right),
 * scaled by 2731/32768, then averaged with the existing dst pixel,
 * rounding up. Reads width+1 pixels from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1605
/**
 * 2x2 blend with weights 2 (pixel), 3 (right), 3 (below), 4 (below right),
 * scaled by 2731/32768, then averaged with the existing dst pixel,
 * rounding up. Reads width+1 pixels from each of height+1 rows.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/*
 * Disabled code: TPEL_WIDTH(width) would generate fixed-width wrappers
 * (put_tpel_pixels<width>_mcXY_c) around the generic put_tpel_pixels_mcXY_c
 * helpers, passing width as a constant.
 *
 * The wrapper bodies used to read "void put_tpel_pixels_mcXY_c(dst, ...);",
 * which is a function declaration with an identifier list rather than a
 * call, so the wrappers would not have forwarded anything. The stray "void"
 * is removed so that the macro is correct should it ever be enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1637
/**
 * Emit one bilinear chroma MC function, OPNAME ## h264_chroma_mc ## W ## _c,
 * that processes W pixels per row for h rows.
 *
 * x and y (each asserted to lie in 0..7) select the weights of the four
 * neighbouring samples; the weights always add up to 64. OP(dest, sum)
 * receives the unscaled weighted sum and does the rounding, scaling and store.
 * When the diagonal weight is zero, x or y is zero, so at most one of the
 * right/below weights is non-zero and a two-tap filter along that single
 * axis (step 1 or step stride) is used instead.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++){\
                OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w1= w01+w10;\
        const int step= w10 ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++){\
                OP(dst[col], (w00*src[col] + w1*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/** Emit the 2-, 4- and 8-pixel-wide chroma MC functions for one store mode. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* The weighted sum carries a factor of 64: add 32 and shift by 6 to round.
 * op_avg then averages the result, rounding up, with the existing value. */
#define op_avg(d, v) d = (((d)+(((v) + 32)>>6)+1)>>1)
#define op_put(d, v) d = (((v) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1746
/**
 * Bilinear chroma interpolation of an 8-pixel-wide block, stored without
 * averaging and with a reduced rounding term.
 *
 * x and y (each asserted to lie in 0..7) select the weights of the four
 * neighbouring samples; the weights add up to 64. The weighted sum gets
 * 32 - 4 = 28 added before the shift by 6, i.e. slightly less than the
 * round-to-nearest bias of 32.
 *
 * @param dst    destination, 8 bytes written per row
 * @param src    source; 9 bytes of the current row and of the next row are read
 * @param stride distance in bytes between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal weight selector, 0..7
 * @param y      vertical weight selector, 0..7
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w00=(8-x)*(8-y);
    const int w01=(  x)*(8-y);
    const int w10=(8-x)*(  y);
    const int w11=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+=stride, src+=stride){
        const uint8_t *next = src + stride;
        for(col=0; col<8; col++){
            dst[col] = (w00*src[col] + w01*src[col+1] + w10*next[col] + w11*next[col+1] + 32 - 4) >> 6;
        }
    }
}
1770
/**
 * Bilinear chroma interpolation of an 8-pixel-wide block, combined with the
 * existing destination through avg2().
 *
 * x and y (each asserted to lie in 0..7) select the weights of the four
 * neighbouring samples; the weights add up to 64. The weighted sum gets
 * 32 - 4 = 28 added before the shift by 6, and the result is passed to
 * avg2() together with the byte already in dst.
 *
 * @param dst    destination, 8 bytes updated per row
 * @param src    source; 9 bytes of the current row and of the next row are read
 * @param stride distance in bytes between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal weight selector, 0..7
 * @param y      vertical weight selector, 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w00=(8-x)*(8-y);
    const int w01=(  x)*(8-y);
    const int w10=(8-x)*(  y);
    const int w11=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+=stride, src+=stride){
        const uint8_t *next = src + stride;
        for(col=0; col<8; col++){
            const int interp = (w00*src[col] + w01*src[col+1] + w10*next[col] + w11*next[col+1] + 32 - 4) >> 6;
            dst[col] = avg2(dst[col], interp);
        }
    }
}
1794
/**
 * Generate the MPEG-4 quarter-pel motion compensation functions for one
 * output mode.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name
 * @param RND    infix pasted into the names of the put helpers that build
 *               intermediate planes (put ## RND ## ...)
 * @param OP     store macro OP(dest, sum) applied to every filter result; the
 *               generated lowpass functions define a local `cm` for it to use
 *
 * The lowpass filters use the symmetric taps (-1, 3, -6, 20, 20, -6, 3, -1),
 * whose sum is 32. Near the block edges the tap indices are folded back so
 * that only src[0..8] (src[0..16] for the 16-wide versions) is read.
 */
1795 #define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal lowpass: for each of h rows, filter src[0..8] into dst[0..7]. */\
1796 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1797     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1798     int i;\
1799     for(i=0; i<h; i++)\
1800     {\
1801         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1802         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1803         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1804         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1805         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1806         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1807         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1808         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1809         dst+=dstStride;\
1810         src+=srcStride;\
1811     }\
1812 }\
1813 \
/* Vertical lowpass for an 8x8 block: each of the 8 columns reads 9 rows of src */\
/* (srcStride apart) and stores 8 filtered rows to dst (dstStride apart) via OP. */\
/* The tap pattern, including the folded indices at both ends, matches the */\
/* horizontal filter with rows in place of columns. */\
1814 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1815     const int w=8;\
1816     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1817     int i;\
1818     for(i=0; i<w; i++)\
1819     {\
1820         const int src0= src[0*srcStride];\
1821         const int src1= src[1*srcStride];\
1822         const int src2= src[2*srcStride];\
1823         const int src3= src[3*srcStride];\
1824         const int src4= src[4*srcStride];\
1825         const int src5= src[5*srcStride];\
1826         const int src6= src[6*srcStride];\
1827         const int src7= src[7*srcStride];\
1828         const int src8= src[8*srcStride];\
1829         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1830         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1831         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1832         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1833         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1834         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1835         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1836         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1837         dst++;\
1838         src++;\
1839     }\
1840 }\
1841 \
/* Horizontal lowpass for 16-pixel-wide rows: for each of h rows, filter */\
/* src[0..16] into dst[0..15] via OP. Taps are (-1, 3, -6, 20, 20, -6, 3, -1); */\
/* the first and last three outputs fold the tap indices back into 0..16. */\
1842 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1843     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1844     int i;\
1845     \
1846     for(i=0; i<h; i++)\
1847     {\
1848         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1849         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1850         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1851         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1852         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1853         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1854         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1855         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1856         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1857         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1858         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1859         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1860         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1861         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1862         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1863         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1864         dst+=dstStride;\
1865         src+=srcStride;\
1866     }\
1867 }\
1868 \
/* Vertical lowpass for a 16x16 block: each of the 16 columns reads 17 rows of */\
/* src (srcStride apart) and stores 16 filtered rows to dst (dstStride apart) */\
/* via OP. Taps are (-1, 3, -6, 20, 20, -6, 3, -1); the first and last three */\
/* outputs fold the tap indices back into rows 0..16. */\
1869 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1870     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1871     int i;\
1872     const int w=16;\
1873     for(i=0; i<w; i++)\
1874     {\
1875         const int src0= src[0*srcStride];\
1876         const int src1= src[1*srcStride];\
1877         const int src2= src[2*srcStride];\
1878         const int src3= src[3*srcStride];\
1879         const int src4= src[4*srcStride];\
1880         const int src5= src[5*srcStride];\
1881         const int src6= src[6*srcStride];\
1882         const int src7= src[7*srcStride];\
1883         const int src8= src[8*srcStride];\
1884         const int src9= src[9*srcStride];\
1885         const int src10= src[10*srcStride];\
1886         const int src11= src[11*srcStride];\
1887         const int src12= src[12*srcStride];\
1888         const int src13= src[13*srcStride];\
1889         const int src14= src[14*srcStride];\
1890         const int src15= src[15*srcStride];\
1891         const int src16= src[16*srcStride];\
1892         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1893         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1894         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1895         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1896         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1897         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1898         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1899         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1900         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1901         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1902         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1903         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1904         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1905         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1906         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1907         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1908         dst++;\
1909         src++;\
1910     }\
1911 }\
1912 \
/* 8x8 wrappers that filter along a single axis. */\
/* mc00 forwards to OPNAME pixels8_c with a height of 8. */\
/* mc20 / mc02 apply the horizontal / vertical lowpass with OP straight into dst. */\
/* mc10 / mc30 build the horizontal lowpass in `half` (8x8, stride 8) with the put RND */\
/* helper and pass it with src (mc10) or src+1 (mc30) to OPNAME pixels8_l2. */\
/* mc01 / mc03 do the same vertically, using full (mc01) or full+16, i.e. one row */\
/* lower (mc03), as the first input. */\
/* The vertical cases stage the source in `full` (row stride 16, 9 rows) first. */\
/* NOTE(review): copy_block9 and pixels8_l2 are defined elsewhere - presumably a 9-byte-wide */\
/* row copy and a two-input average; confirm against their definitions. */\
1913 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1914     OPNAME ## pixels8_c(dst, src, stride, 8);\
1915 }\
1916 \
1917 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1918     uint8_t half[64];\
1919     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1920     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1921 }\
1922 \
1923 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1924     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1925 }\
1926 \
1927 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1928     uint8_t half[64];\
1929     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1930     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1931 }\
1932 \
1933 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1934     uint8_t full[16*9];\
1935     uint8_t half[64];\
1936     copy_block9(full, src, 16, stride, 9);\
1937     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1938     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1939 }\
1940 \
1941 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1942     uint8_t full[16*9];\
1943     copy_block9(full, src, 16, stride, 9);\
1944     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1945 }\
1946 \
1947 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1948     uint8_t full[16*9];\
1949     uint8_t half[64];\
1950     copy_block9(full, src, 16, stride, 9);\
1951     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1952     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1953 }\
/* 8x8 wrappers mc11, mc31, mc13 and mc33, each in two forms. */\
/* The non-static ff_..._old_c form computes halfH (horizontal lowpass of 9 rows), */\
/* halfV (vertical lowpass of full or full+1) and halfHV (vertical lowpass of halfH) */\
/* and hands all of them plus the staged source to OPNAME pixels8_l4. */\
/* The static form first folds the staged source (full or full+1) into halfH with */\
/* put RND pixels8_l2, filters that vertically into halfHV, and finishes with a */\
/* single OPNAME pixels8_l2. */\
/* halfH is 9 rows of 8 bytes (72 bytes), so halfH+8 starts one row lower; likewise */\
/* full+16 is one row lower in the stride-16 staging buffer and full+17 one row lower */\
/* and one pixel to the right. */\
/* NOTE(review): pixels8_l2 / pixels8_l4 are defined elsewhere - presumably 2- and 4-input */\
/* averages; confirm against their definitions. */\
1954 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1955     uint8_t full[16*9];\
1956     uint8_t halfH[72];\
1957     uint8_t halfV[64];\
1958     uint8_t halfHV[64];\
1959     copy_block9(full, src, 16, stride, 9);\
1960     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1961     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1962     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1963     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1964 }\
1965 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1966     uint8_t full[16*9];\
1967     uint8_t halfH[72];\
1968     uint8_t halfHV[64];\
1969     copy_block9(full, src, 16, stride, 9);\
1970     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1971     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1972     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1973     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1974 }\
1975 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1976     uint8_t full[16*9];\
1977     uint8_t halfH[72];\
1978     uint8_t halfV[64];\
1979     uint8_t halfHV[64];\
1980     copy_block9(full, src, 16, stride, 9);\
1981     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1982     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1983     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1984     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1985 }\
1986 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1987     uint8_t full[16*9];\
1988     uint8_t halfH[72];\
1989     uint8_t halfHV[64];\
1990     copy_block9(full, src, 16, stride, 9);\
1991     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1992     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1993     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1994     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1995 }\
1996 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1997     uint8_t full[16*9];\
1998     uint8_t halfH[72];\
1999     uint8_t halfV[64];\
2000     uint8_t halfHV[64];\
2001     copy_block9(full, src, 16, stride, 9);\
2002     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2003     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2004     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2005     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2006 }\
2007 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2008     uint8_t full[16*9];\
2009     uint8_t halfH[72];\
2010     uint8_t halfHV[64];\
2011     copy_block9(full, src, 16, stride, 9);\
2012     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2013     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2014     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2015     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2016 }\
2017 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t full[16*9];\
2019     uint8_t halfH[72];\
2020     uint8_t halfV[64];\
2021     uint8_t halfHV[64];\
2022     copy_block9(full, src, 16, stride, 9);\
2023     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
2024     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2025     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2026     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2027 }\
2028 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2029     uint8_t full[16*9];\
2030     uint8_t halfH[72];\
2031     uint8_t halfHV[64];\
2032     copy_block9(full, src, 16, stride, 9);\
2033     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2034     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2035     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2036     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2037 }\
/* 8x8 wrappers mc21, mc23, mc12, mc32 and mc22. */\
/* mc21 / mc23 run the horizontal lowpass on src directly (9 rows into halfH), filter */\
/* that vertically into halfHV, and pass halfH (mc21) or halfH+8, one row lower (mc23), */\
/* together with halfHV to OPNAME pixels8_l2. */\
/* mc12 / mc32 fold the staged source (full or full+1) into halfH with put RND */\
/* pixels8_l2 and then apply the vertical lowpass with OP straight into dst; the */\
/* non-static ff_..._old_c forms instead combine halfV and halfHV with OPNAME pixels8_l2. */\
/* NOTE(review): in the two _old_c forms `full` is only read by the lowpass calls, so */\
/* halfH there is the unblended horizontal lowpass. */\
/* mc22 is the horizontal lowpass followed by the vertical lowpass with OP. */\
2038 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2039     uint8_t halfH[72];\
2040     uint8_t halfHV[64];\
2041     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2042     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2043     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2044 }\
2045 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2046     uint8_t halfH[72];\
2047     uint8_t halfHV[64];\
2048     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2049     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2050     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2051 }\
2052 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2053     uint8_t full[16*9];\
2054     uint8_t halfH[72];\
2055     uint8_t halfV[64];\
2056     uint8_t halfHV[64];\
2057     copy_block9(full, src, 16, stride, 9);\
2058     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2059     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2060     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2061     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2062 }\
2063 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2064     uint8_t full[16*9];\
2065     uint8_t halfH[72];\
2066     copy_block9(full, src, 16, stride, 9);\
2067     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2068     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2069     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2070 }\
2071 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2072     uint8_t full[16*9];\
2073     uint8_t halfH[72];\
2074     uint8_t halfV[64];\
2075     uint8_t halfHV[64];\
2076     copy_block9(full, src, 16, stride, 9);\
2077     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2078     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2079     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2080     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2081 }\
2082 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2083     uint8_t full[16*9];\
2084     uint8_t halfH[72];\
2085     copy_block9(full, src, 16, stride, 9);\
2086     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2087     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2088     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2089 }\
2090 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2091     uint8_t halfH[72];\
2092     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2093     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2094 }\
/* 16x16 wrappers that filter along a single axis. */\
/* mc00 forwards to OPNAME pixels16_c with a height of 16. */\
/* mc20 / mc02 apply the horizontal / vertical lowpass with OP straight into dst. */\
/* mc10 / mc30 build the horizontal lowpass in `half` (16x16, stride 16) with the put RND */\
/* helper and pass it with src (mc10) or src+1 (mc30) to OPNAME pixels16_l2. */\
/* mc01 / mc03 do the same vertically, using full (mc01) or full+24, i.e. one row */\
/* lower (mc03), as the first input. */\
/* The vertical cases stage the source in `full` (row stride 24, 17 rows) first. */\
/* NOTE(review): copy_block17 and pixels16_l2 are defined elsewhere - presumably a 17-byte-wide */\
/* row copy and a two-input average; confirm against their definitions. */\
2095 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2096     OPNAME ## pixels16_c(dst, src, stride, 16);\
2097 }\
2098 \
2099 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2100     uint8_t half[256];\
2101     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2102     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2103 }\
2104 \
2105 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2106     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2107 }\
2108 \
2109 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2110     uint8_t half[256];\
2111     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2112     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2113 }\
2114 \
2115 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2116     uint8_t full[24*17];\
2117     uint8_t half[256];\
2118     copy_block17(full, src, 24, stride, 17);\
2119     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2120     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2121 }\
2122 \
2123 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2124     uint8_t full[24*17];\
2125     copy_block17(full, src, 24, stride, 17);\
2126     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2127 }\
2128 \
2129 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2130     uint8_t full[24*17];\
2131     uint8_t half[256];\
2132     copy_block17(full, src, 24, stride, 17);\
2133     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2134     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2135 }\
/* 16x16 wrappers mc11, mc31, mc13 and mc33, each in two forms. */\
/* The non-static ff_..._old_c form computes halfH (horizontal lowpass of 17 rows), */\
/* halfV (vertical lowpass of full or full+1) and halfHV (vertical lowpass of halfH) */\
/* and hands all of them plus the staged source to OPNAME pixels16_l4. */\
/* The static form first folds the staged source (full or full+1) into halfH with */\
/* put RND pixels16_l2, filters that vertically into halfHV, and finishes with a */\
/* single OPNAME pixels16_l2. */\
/* halfH is 17 rows of 16 bytes (272 bytes), so halfH+16 starts one row lower; likewise */\
/* full+24 is one row lower in the stride-24 staging buffer and full+25 one row lower */\
/* and one pixel to the right. */\
/* NOTE(review): pixels16_l2 / pixels16_l4 are defined elsewhere - presumably 2- and 4-input */\
/* averages; confirm against their definitions. */\
2136 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2137     uint8_t full[24*17];\
2138     uint8_t halfH[272];\
2139     uint8_t halfV[256];\
2140     uint8_t halfHV[256];\
2141     copy_block17(full, src, 24, stride, 17);\
2142     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2143     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2144     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2145     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2146 }\
2147 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2148     uint8_t full[24*17];\
2149     uint8_t halfH[272];\
2150     uint8_t halfHV[256];\
2151     copy_block17(full, src, 24, stride, 17);\
2152     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2153     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2154     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2155     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2156 }\
2157 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2158     uint8_t full[24*17];\
2159     uint8_t halfH[272];\
2160     uint8_t halfV[256];\
2161     uint8_t halfHV[256];\
2162     copy_block17(full, src, 24, stride, 17);\
2163     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2164     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2165     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2166     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2167 }\
2168 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2169     uint8_t full[24*17];\
2170     uint8_t halfH[272];\
2171     uint8_t halfHV[256];\
2172     copy_block17(full, src, 24, stride, 17);\
2173     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2174     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2175     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2176     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2177 }\
2178 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2179     uint8_t full[24*17];\
2180     uint8_t halfH[272];\
2181     uint8_t halfV[256];\
2182     uint8_t halfHV[256];\
2183     copy_block17(full, src, 24, stride, 17);\
2184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2185     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2187     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2188 }\
2189 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2190     uint8_t full[24*17];\
2191     uint8_t halfH[272];\
2192     uint8_t halfHV[256];\
2193     copy_block17(full, src, 24, stride, 17);\
2194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2195     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2197     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2198 }\
2199 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2200     uint8_t full[24*17];\
2201     uint8_t halfH[272];\
2202     uint8_t halfV[256];\
2203     uint8_t halfHV[256];\
2204     copy_block17(full, src, 24, stride, 17);\
2205     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2206     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2207     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2208     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2209 }\
2210 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2211     uint8_t full[24*17];\
2212     uint8_t halfH[272];\
2213     uint8_t halfHV[256];\
2214     copy_block17(full, src, 24, stride, 17);\
2215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2216     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2218     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2219 }\
/* 16x16 wrappers mc21, mc23, mc12, mc32 and mc22. */\
/* mc21 / mc23 run the horizontal lowpass on src directly (17 rows into halfH), filter */\
/* that vertically into halfHV, and pass halfH (mc21) or halfH+16, one row lower (mc23), */\
/* together with halfHV to OPNAME pixels16_l2. */\
/* mc12 / mc32 fold the staged source (full or full+1) into halfH with put RND */\
/* pixels16_l2 and then apply the vertical lowpass with OP straight into dst; the */\
/* non-static ff_..._old_c forms instead combine halfV and halfHV with OPNAME pixels16_l2. */\
/* mc22 is the horizontal lowpass followed by the vertical lowpass with OP. */\
2220 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2221     uint8_t halfH[272];\
2222     uint8_t halfHV[256];\
2223     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2224     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2225     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2226 }\
2227 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2228     uint8_t halfH[272];\
2229     uint8_t halfHV[256];\
2230     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2231     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2232     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2233 }\
2234 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2235     uint8_t full[24*17];\
2236     uint8_t halfH[272];\
2237     uint8_t halfV[256];\
2238     uint8_t halfHV[256];\
2239     copy_block17(full, src, 24, stride, 17);\
2240     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2241     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2242     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2243     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2244 }\
2245 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2246     uint8_t full[24*17];\
2247     uint8_t halfH[272];\
2248     copy_block17(full, src, 24, stride, 17);\
2249     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2250     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2251     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2252 }\
2253 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2254     uint8_t full[24*17];\
2255     uint8_t halfH[272];\
2256     uint8_t halfV[256];\
2257     uint8_t halfHV[256];\
2258     copy_block17(full, src, 24, stride, 17);\
2259     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2260     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2261     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2262     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2263 }\
2264 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2265     uint8_t full[24*17];\
2266     uint8_t halfH[272];\
2267     copy_block17(full, src, 24, stride, 17);\
2268     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2269     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2270     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2271 }\
2272 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2273     uint8_t halfH[272];\
2274     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2275     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2276 }
2277
/*
 * Store operators for the QPEL_MC expansions below (passed as the last macro
 * argument). "a" is the destination lvalue, "b" the raw filter sum.
 * Every operator shifts the biased sum right by 5 and indexes "cm" with the
 * result; "cm" is not defined by these macros and must be in scope wherever
 * they are expanded.
 *   op_put / op_put_no_rnd: store the looked-up value (bias 16 / bias 15).
 *   op_avg / op_avg_no_rnd: average it with the current value of "a"
 *                           (bias 16 and +1 before halving / bias 15, no +1).
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Expand QPEL_MC for the put, put_no_rnd and avg flavours. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
/* Disabled avg_no_rnd expansion; op_avg_no_rnd is not used by any active line. */
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only needed by the expansions above. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2291
2292 #if 1
2293 #define H264_LOWPASS(OPNAME, OP, OP2) \
2294 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2295     const int h=2;\
2296     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2297     int i;\
2298     for(i=0; i<h; i++)\
2299     {\
2300         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2301         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2302         dst+=dstStride;\
2303         src+=srcStride;\
2304     }\
2305 }\
2306 \
2307 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2308     const int w=2;\
2309     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2310     int i;\
2311     for(i=0; i<w; i++)\
2312     {\
2313         const int srcB= src[-2*srcStride];\
2314         const int srcA= src[-1*srcStride];\
2315         const int src0= src[0 *srcStride];\
2316         const int src1= src[1 *srcStride];\
2317         const int src2= src[2 *srcStride];\
2318         const int src3= src[3 *srcStride];\
2319         const int src4= src[4 *srcStride];\
2320         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2321         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2322         dst++;\
2323         src++;\
2324     }\
2325 }\
2326 \
2327 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2328     const int h=2;\
2329     const int w=2;\
2330     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2331     int i;\
2332     src -= 2*srcStride;\
2333     for(i=0; i<h+5; i++)\
2334     {\
2335         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2336         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2337         tmp+=tmpStride;\
2338         src+=srcStride;\
2339     }\
2340     tmp -= tmpStride*(h+5-2);\
2341     for(i=0; i<w; i++)\
2342     {\
2343         const int tmpB= tmp[-2*tmpStride];\
2344         const int tmpA= tmp[-1*tmpStride];\
2345         const int tmp0= tmp[0 *tmpStride];\
2346         const int tmp1= tmp[1 *tmpStride];\
2347         const int tmp2= tmp[2 *tmpStride];\
2348         const int tmp3= tmp[3 *tmpStride];\
2349         const int tmp4= tmp[4 *tmpStride];\
2350         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2351         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2352         dst++;\
2353         tmp++;\
2354     }\
2355 }\
2356 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2357     const int h=4;\
2358     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2359     int i;\
2360     for(i=0; i<h; i++)\
2361     {\
2362         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2363         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2364         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2365         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2366         dst+=dstStride;\
2367         src+=srcStride;\
2368     }\
2369 }\
2370 \
2371 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2372     const int w=4;\
2373     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2374     int i;\
2375     for(i=0; i<w; i++)\
2376     {\
2377         const int srcB= src[-2*srcStride];\
2378         const int srcA= src[-1*srcStride];\
2379         const int src0= src[0 *srcStride];\
2380         const int src1= src[1 *srcStride];\
2381         const int src2= src[2 *srcStride];\
2382         const int src3= src[3 *srcStride];\
2383         const int src4= src[4 *srcStride];\
2384         const int src5= src[5 *srcStride];\
2385         const int src6= src[6 *srcStride];\
2386         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2387         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2388         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2389         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2390         dst++;\
2391         src++;\
2392     }\
2393 }\
2394 \
2395 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2396     const int h=4;\
2397     const int w=4;\
2398     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2399     int i;\
2400     src -= 2*srcStride;\
2401     for(i=0; i<h+5; i++)\
2402     {\
2403         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2404         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2405         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2406         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2407         tmp+=tmpStride;\
2408         src+=srcStride;\
2409     }\
2410     tmp -= tmpStride*(h+5-2);\
2411     for(i=0; i<w; i++)\
2412     {\
2413         const int tmpB= tmp[-2*tmpStride];\
2414         const int tmpA= tmp[-1*tmpStride];\
2415         const int tmp0= tmp[0 *tmpStride];\
2416         const int tmp1= tmp[1 *tmpStride];\
2417         const int tmp2= tmp[2 *tmpStride];\
2418         const int tmp3= tmp[3 *tmpStride];\
2419         const int tmp4= tmp[4 *tmpStride];\
2420         const int tmp5= tmp[5 *tmpStride];\
2421         const int tmp6= tmp[6 *tmpStride];\
2422         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2423         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2424         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2425         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2426         dst++;\
2427         tmp++;\
2428     }\
2429 }\
2430 \
2431 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2432     const int h=8;\
2433     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2434     int i;\
2435     for(i=0; i<h; i++)\
2436     {\
2437         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2438         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2439         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2440         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2441         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2442         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2443         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2444         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2445         dst+=dstStride;\
2446         src+=srcStride;\
2447     }\
2448 }\
2449 \
2450 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451     const int w=8;\
2452     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2453     int i;\
2454     for(i=0; i<w; i++)\
2455     {\
2456         const int srcB= src[-2*srcStride];\
2457         const int srcA= src[-1*srcStride];\
2458         const int src0= src[0 *srcStride];\
2459         const int src1= src[1 *srcStride];\
2460         const int src2= src[2 *srcStride];\
2461         const int src3= src[3 *srcStride];\
2462         const int src4= src[4 *srcStride];\
2463         const int src5= src[5 *srcStride];\
2464         const int src6= src[6 *srcStride];\
2465         const int src7= src[7 *srcStride];\
2466         const int src8= src[8 *srcStride];\
2467         const int src9= src[9 *srcStride];\
2468         const int src10=src[10*srcStride];\
2469         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2470         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2471         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2472         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2473         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2474         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2475         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2476         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2477         dst++;\
2478         src++;\
2479     }\
2480 }\
2481 \
2482 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2483     const int h=8;\
2484     const int w=8;\
2485     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2486     int i;\
2487     src -= 2*srcStride;\
2488     for(i=0; i<h+5; i++)\
2489     {\
2490         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2491         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2492         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2493         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2494         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2495         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2496         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2497         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2498         tmp+=tmpStride;\
2499         src+=srcStride;\
2500     }\
2501     tmp -= tmpStride*(h+5-2);\
2502     for(i=0; i<w; i++)\
2503     {\
2504         const int tmpB= tmp[-2*tmpStride];\
2505         const int tmpA= tmp[-1*tmpStride];\
2506         const int tmp0= tmp[0 *tmpStride];\
2507         const int tmp1= tmp[1 *tmpStride];\
2508         const int tmp2= tmp[2 *tmpStride];\
2509         const int tmp3= tmp[3 *tmpStride];\
2510         const int tmp4= tmp[4 *tmpStride];\
2511         const int tmp5= tmp[5 *tmpStride];\
2512         const int tmp6= tmp[6 *tmpStride];\
2513         const int tmp7= tmp[7 *tmpStride];\
2514         const int tmp8= tmp[8 *tmpStride];\
2515         const int tmp9= tmp[9 *tmpStride];\
2516         const int tmp10=tmp[10*tmpStride];\
2517         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2518         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2519         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2520         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2521         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2522         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2523         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2524         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2525         dst++;\
2526         tmp++;\
2527     }\
2528 }\
2529 \
2530 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2531     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2532     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2533     src += 8*srcStride;\
2534     dst += 8*dstStride;\
2535     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2536     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2537 }\
2538 \
2539 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2540     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2541     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2542     src += 8*srcStride;\
2543     dst += 8*dstStride;\
2544     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2545     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2546 }\
2547 \
2548 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2549     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2550     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2551     src += 8*srcStride;\
2552     dst += 8*dstStride;\
2553     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2554     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2555 }\
2556
/*
 * H264_MC(OPNAME, SIZE) generates the sixteen functions
 * OPNAME##h264_qpel<SIZE>_mcXY_c for a SIZE x SIZE block.
 * As the bodies show, X selects the horizontal and Y the vertical treatment
 * (presumably the quarter-sample offsets):
 *   mc00        plain block operation;
 *   mc20/mc02   horizontal / vertical low-pass written straight to dst;
 *   mc22        combined hv low-pass written straight to dst;
 *   all others  two intermediate planes merged by OPNAME##pixels<SIZE>_l2.
 * Intermediate planes are always produced with the put_ filters; only the
 * final store uses OPNAME. Vertical filters work on a private copy of
 * SIZE+5 source rows that starts two rows above src.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t * const mid= rows + SIZE*2; /* copy of the row src points at */\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, mid, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows + SIZE*2, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t * const mid= rows + SIZE*2;\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, mid+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows + SIZE*2, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows + SIZE*2, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows + SIZE*2, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows + SIZE*2, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}
2693
/*
 * Store operators for the H.264 expansions below. "a" is the destination
 * lvalue, "b" the raw filter sum; "cm" must be in scope at the expansion
 * site.
 *   op_put  / op_avg : bias 16, shift by 5  (passed as OP of H264_LOWPASS).
 *   op2_put / op2_avg: bias 512, shift by 10 (passed as OP2, which the
 *                      hv filters use for their second pass).
 * The avg forms average the looked-up value with the current "a", adding 1
 * before halving.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Low-pass filters for both flavours, then the mcXY functions per block size. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
/* Size 2 is generated for put_ only; there is no H264_MC(avg_, 2). */
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The operators are only needed by the expansions above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2715
/*
 * H.264 weighted prediction, C versions.
 *
 * op_scale1(x) rescales block[x] in place:
 *     clip_uint8((block[x]*weight + offset) >> log2_denom)
 * op_scale2(x) blends src[x] into dst[x]:
 *     clip_uint8((src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
 *
 * H264_WEIGHT(W,H) generates weight_h264_pixels<W>x<H>_c and
 * biweight_h264_pixels<W>x<H>_c, which apply these operators to the first W
 * bytes of H rows that are "stride" bytes apart.
 *
 * Before the loops "offset" is pre-scaled by log2_denom; the weight variant
 * also adds the rounding term 1 << (log2_denom-1) when log2_denom is non-zero.
 * The pre-scaling shift is done on an unsigned value: offset is a signed int,
 * and left-shifting a negative int is undefined behaviour in C. Converting
 * the result back to int gives the two's-complement value on the compilers
 * this code targets.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset = (unsigned)offset << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        /* W is a literal at every expansion, so this loop has a fixed trip count */ \
        for(x=0; x<W; x++) \
            op_scale1(x); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int x, y; \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++) \
            op_scale2(x); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2785
2786 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2787     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2788     int i;
2789
2790     for(i=0; i<h; i++){
2791         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2792         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2793         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2794         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2795         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2796         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2797         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2798         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2799         dst+=dstStride;
2800         src+=srcStride;
2801     }
2802 }
2803
#if CONFIG_CAVS_DECODER
/* AVS specific */
/* Prototype only; the definition is not in this part of the file. */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points: each one forwards to the generic put/avg function of
 * the matching width, passing the width again as the last argument.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2821
/* Prototype only; declared unconditionally, definition not in this part of the file. */
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2823
#if CONFIG_VC1_DECODER
/* VC-1 specific */
/* Prototype only; the definition is not in this part of the file. */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points: forward to the generic 8-wide put/avg functions with a
 * last argument of 8. The rnd parameter is accepted but not used.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
2835
/* Prototypes only; none of these init functions is defined in this part of the file. */
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

/* Declared only when the RV30 decoder is enabled. */
#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */
2844
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points: each forwards to the generic xy2 put/avg function
 * of the matching width, passing the width again as the last argument.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

/* Prototype only; the definition is not in this part of the file. */
void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2861
2862 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2863     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2864     int i;
2865
2866     for(i=0; i<w; i++){
2867         const int src_1= src[ -srcStride];
2868         const int src0 = src[0          ];
2869         const int src1 = src[  srcStride];
2870         const int src2 = src[2*srcStride];
2871         const int src3 = src[3*srcStride];
2872         const int src4 = src[4*srcStride];
2873         const int src5 = src[5*srcStride];
2874         const int src6 = src[6*srcStride];
2875         const int src7 = src[7*srcStride];
2876         const int src8 = src[8*srcStride];
2877         const int src9 = src[9*srcStride];
2878         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2879         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2880         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2881         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2882         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2883         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2884         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2885         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2886         src++;
2887         dst++;
2888     }
2889 }
2890
/** mc00: no filtering, forwards to put_pixels8_c with a last argument of 8. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2894
/** mc10: merges the source block with its horizontally filtered version via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2900
/** mc20: horizontal low-pass of 8 rows written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2904
/** mc30: merges the source block shifted one pixel right with the horizontally filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2910
/** mc02: vertical low-pass of 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2914
/**
 * mc12: merges the vertically filtered block with the block filtered in both
 * directions. The horizontal pass covers 11 rows starting one row above src,
 * so the following vertical pass (which starts at the second of those rows)
 * has the rows it needs above and below.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t vert[8*8];
    uint8_t horiz[8*11];
    uint8_t both[8*8];

    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(both, horiz+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mc32: like mc12, but the purely vertical pass reads the source shifted one
 * pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t vert[8*8];
    uint8_t horiz[8*11];
    uint8_t both[8*8];

    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(both, horiz+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mc22: horizontal low-pass over 11 rows (starting one row above src) into a
 * temporary, then vertical low-pass of that temporary straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t horiz[8*11];

    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, horiz+8, stride, 8, 8);
}
2938
2939 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2940     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2941     int x;
2942     const int strength= ff_h263_loop_filter_strength[qscale];
2943
2944     for(x=0; x<8; x++){
2945         int d1, d2, ad1;
2946         int p0= src[x-2*stride];
2947         int p1= src[x-1*stride];
2948         int p2= src[x+0*stride];
2949         int p3= src[x+1*stride];
2950         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2951
2952         if     (d<-2*strength) d1= 0;
2953         else if(d<-  strength) d1=-2*strength - d;
2954         else if(d<   strength) d1= d;
2955         else if(d< 2*strength) d1= 2*strength - d;
2956         else                   d1= 0;
2957
2958         p1 += d1;
2959         p2 -= d1;
2960         if(p1&256) p1= ~(p1>>31);
2961         if(p2&256) p2= ~(p2>>31);
2962
2963         src[x-1*stride] = p1;
2964         src[x+0*stride] = p2;
2965
2966         ad1= FFABS(d1)>>1;
2967
2968         d2= av_clip((p0-p3)/4, -ad1, ad1);
2969
2970         src[x-2*stride] = p0 - d2;
2971         src[x+  stride] = p3 + d2;
2972     }
2973     }
2974 }
2975
2976 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2977     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2978     int y;
2979     const int strength= ff_h263_loop_filter_strength[qscale];
2980
2981     for(y=0; y<8; y++){
2982         int d1, d2, ad1;
2983         int p0= src[y*stride-2];
2984         int p1= src[y*stride-1];
2985         int p2= src[y*stride+0];
2986         int p3= src[y*stride+1];
2987         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2988
2989         if     (d<-2*strength) d1= 0;
2990         else if(d<-  strength) d1=-2*strength - d;
2991         else if(d<   strength) d1= d;
2992         else if(d< 2*strength) d1= 2*strength - d;
2993         else                   d1= 0;
2994
2995         p1 += d1;
2996         p2 -= d1;
2997         if(p1&256) p1= ~(p1>>31);
2998         if(p2&256) p2= ~(p2>>31);
2999
3000         src[y*stride-1] = p1;
3001         src[y*stride+0] = p2;
3002
3003         ad1= FFABS(d1)>>1;
3004
3005         d2= av_clip((p0-p3)/4, -ad1, ad1);
3006
3007         src[y*stride-2] = p0 - d2;
3008         src[y*stride+1] = p3 + d2;
3009     }
3010     }
3011 }
3012
/**
 * In-place separable [1 2 1] smoothing of an 8x8 block.
 * The vertical pass skips the first and last rows (they are only scaled by 4),
 * the horizontal pass skips the first and last columns (they are only
 * rescaled from the vertical result).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* rows 0 and 7: no vertical filtering, just the common scale of 4 */
    for(col=0; col<8; col++){
        vert[col]      = 4*src[col];
        vert[56 + col] = 4*src[7*stride + col];
    }

    /* rows 1..6: vertical [1 2 1] */
    for(row=1; row<7; row++){
        const uint8_t *in  = src  + row*stride;
        int           *out = vert + row*8;
        for(col=0; col<8; col++)
            out[col] = in[col - stride] + 2*in[col] + in[col + stride];
    }

    /* horizontal pass, written back over the source block */
    for(row=0; row<8; row++){
        const int *in  = vert + row*8;
        uint8_t   *out = src  + row*stride;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
3039
3040 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3041 {
3042     int i, d;
3043     for( i = 0; i < 4; i++ ) {
3044         if( tc0[i] < 0 ) {
3045             pix += 4*ystride;
3046             continue;
3047         }
3048         for( d = 0; d < 4; d++ ) {
3049             const int p0 = pix[-1*xstride];
3050             const int p1 = pix[-2*xstride];
3051             const int p2 = pix[-3*xstride];
3052             const int q0 = pix[0];
3053             const int q1 = pix[1*xstride];
3054             const int q2 = pix[2*xstride];
3055
3056             if( FFABS( p0 - q0 ) < alpha &&
3057                 FFABS( p1 - p0 ) < beta &&
3058                 FFABS( q1 - q0 ) < beta ) {
3059
3060                 int tc = tc0[i];
3061                 int i_delta;
3062
3063                 if( FFABS( p2 - p0 ) < beta ) {
3064                     if(tc0[i])
3065                     pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3066                     tc++;
3067                 }
3068                 if( FFABS( q2 - q0 ) < beta ) {
3069                     if(tc0[i])
3070                     pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3071                     tc++;
3072                 }
3073
3074                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3075                 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
3076                 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
3077             }
3078             pix += ystride;
3079         }
3080     }
3081 }
/** Luma edge filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1,
                            alpha, beta, tc0);
}
/** Luma edge filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride,
                            alpha, beta, tc0);
}
3090
3091 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3092 {
3093     int d;
3094     for( d = 0; d < 16; d++ ) {
3095         const int p2 = pix[-3*xstride];
3096         const int p1 = pix[-2*xstride];
3097         const int p0 = pix[-1*xstride];
3098
3099         const int q0 = pix[ 0*xstride];
3100         const int q1 = pix[ 1*xstride];
3101         const int q2 = pix[ 2*xstride];
3102
3103         if( FFABS( p0 - q0 ) < alpha &&
3104             FFABS( p1 - p0 ) < beta &&
3105             FFABS( q1 - q0 ) < beta ) {
3106
3107             if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3108                 if( FFABS( p2 - p0 ) < beta)
3109                 {
3110                     const int p3 = pix[-4*xstride];
3111                     /* p0', p1', p2' */
3112                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3113                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3114                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3115                 } else {
3116                     /* p0' */
3117                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3118                 }
3119                 if( FFABS( q2 - q0 ) < beta)
3120                 {
3121                     const int q3 = pix[3*xstride];
3122                     /* q0', q1', q2' */
3123                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3124                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3125                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3126                 } else {
3127                     /* q0' */
3128                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3129                 }
3130             }else{
3131                 /* p0', q0' */
3132                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3133                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3134             }
3135         }
3136         pix += ystride;
3137     }
3138 }
/** Intra luma edge filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1,
                                  alpha, beta);
}
/** Intra luma edge filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride,
                                  alpha, beta);
}
3147
3148 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3149 {
3150     int i, d;
3151     for( i = 0; i < 4; i++ ) {
3152         const int tc = tc0[i];
3153         if( tc <= 0 ) {
3154             pix += 2*ystride;
3155             continue;
3156         }
3157         for( d = 0; d < 2; d++ ) {
3158             const int p0 = pix[-1*xstride];
3159             const int p1 = pix[-2*xstride];
3160             const int q0 = pix[0];
3161             const int q1 = pix[1*xstride];
3162
3163             if( FFABS( p0 - q0 ) < alpha &&
3164                 FFABS( p1 - p0 ) < beta &&
3165                 FFABS( q1 - q0 ) < beta ) {
3166
3167                 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3168
3169                 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
3170                 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
3171             }
3172             pix += ystride;
3173         }
3174     }
3175 }
/** Chroma edge filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1,
                              alpha, beta, tc0);
}
/** Chroma edge filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride,
                              alpha, beta, tc0);
}
3184
3185 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3186 {
3187     int d;
3188     for( d = 0; d < 8; d++ ) {
3189         const int p0 = pix[-1*xstride];
3190         const int p1 = pix[-2*xstride];
3191         const int q0 = pix[0];
3192         const int q1 = pix[1*xstride];
3193
3194         if( FFABS( p0 - q0 ) < alpha &&
3195             FFABS( p1 - p0 ) < beta &&
3196             FFABS( q1 - q0 ) < beta ) {
3197
3198             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
3199             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
3200         }
3201         pix += ystride;
3202     }
3203 }
/** Intra chroma edge filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1,
                                    alpha, beta);
}
/** Intra chroma edge filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride,
                                    alpha, beta);
}
3212
/**
 * Sum of absolute differences between two 16-pixel-wide areas of h rows.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both inputs
 * @return the accumulated SAD
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3240
/**
 * SAD of a 16-wide area against the reference averaged (avg2) with its right
 * neighbour; reads 17 reference pixels per row.
 * @param v unused
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3268
/**
 * SAD of a 16-wide area against the reference averaged (avg2) with the row
 * below it; reads h+1 reference rows.
 * @param v unused
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3298
/**
 * SAD of a 16-wide area against the reference averaged (avg4) over each
 * 2x2 neighbourhood; reads 17 columns and h+1 rows of the reference.
 * @param v unused
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3328
/**
 * Sum of absolute differences between two 8-pixel-wide areas of h rows.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both inputs
 * @return the accumulated SAD
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3348
/**
 * SAD of an 8-wide area against the reference averaged (avg2) with its right
 * neighbour; reads 9 reference pixels per row.
 * @param v unused
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3368
/**
 * SAD of an 8-wide area against the reference averaged (avg2) with the row
 * below it; reads h+1 reference rows.
 * @param v unused
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3390
/**
 * SAD of an 8-wide area against the reference averaged (avg4) over each
 * 2x2 neighbourhood; reads 9 columns and h+1 rows of the reference.
 * @param v unused
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3412
3413 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3414     MpegEncContext *c = v;
3415     int score1=0;
3416     int score2=0;
3417     int x,y;
3418
3419     for(y=0; y<h; y++){
3420         for(x=0; x<16; x++){
3421             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3422         }
3423         if(y+1<h){
3424             for(x=0; x<15; x++){
3425                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3426                              - s1[x+1] + s1[x+1+stride])
3427                         -FFABS(  s2[x  ] - s2[x  +stride]
3428                              - s2[x+1] + s2[x+1+stride]);
3429             }
3430         }
3431         s1+= stride;
3432         s2+= stride;
3433     }
3434
3435     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3436     else  return score1 + FFABS(score2)*8;
3437 }
3438
3439 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3440     MpegEncContext *c = v;
3441     int score1=0;
3442     int score2=0;
3443     int x,y;
3444
3445     for(y=0; y<h; y++){
3446         for(x=0; x<8; x++){
3447             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3448         }
3449         if(y+1<h){
3450             for(x=0; x<7; x++){
3451                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3452                              - s1[x+1] + s1[x+1+stride])
3453                         -FFABS(  s2[x  ] - s2[x  +stride]
3454                              - s2[x+1] + s2[x+1+stride]);
3455             }
3456         }
3457         s1+= stride;
3458         s2+= stride;
3459     }
3460
3461     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3462     else  return score1 + FFABS(score2)*8;
3463 }
3464
3465 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3466     int i;
3467     unsigned int sum=0;
3468
3469     for(i=0; i<8*8; i++){
3470         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3471         int w= weight[i];
3472         b>>= RECON_SHIFT;
3473         assert(-512<b && b<512);
3474
3475         sum += (w*b)*(w*b)>>4;
3476     }
3477     return sum>>2;
3478 }
3479
3480 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3481     int i;
3482
3483     for(i=0; i<8*8; i++){
3484         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3485     }
3486 }
3487
3488 /**
3489  * permutes an 8x8 block.
3490  * @param block the block which will be permuted according to the given permutation vector
3491  * @param permutation the permutation vector
3492  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3493  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3494  *                  (inverse) permutated to scantable order!
3495  */
3496 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3497 {
3498     int i;
3499     DCTELEM temp[64];
3500
3501     if(last<=0) return;
3502     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3503
3504     for(i=0; i<=last; i++){
3505         const int j= scantable[i];
3506         temp[j]= block[j];
3507         block[j]=0;
3508     }
3509
3510     for(i=0; i<=last; i++){
3511         const int j= scantable[i];
3512         const int perm_j= permutation[j];
3513         block[perm_j]= temp[j];
3514     }
3515 }
3516
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3520
3521 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3522     int i;
3523
3524     memset(cmp, 0, sizeof(void*)*6);
3525
3526     for(i=0; i<6; i++){
3527         switch(type&0xFF){
3528         case FF_CMP_SAD:
3529             cmp[i]= c->sad[i];
3530             break;
3531         case FF_CMP_SATD:
3532             cmp[i]= c->hadamard8_diff[i];
3533             break;
3534         case FF_CMP_SSE:
3535             cmp[i]= c->sse[i];
3536             break;
3537         case FF_CMP_DCT:
3538             cmp[i]= c->dct_sad[i];
3539             break;
3540         case FF_CMP_DCT264:
3541             cmp[i]= c->dct264_sad[i];
3542             break;
3543         case FF_CMP_DCTMAX:
3544             cmp[i]= c->dct_max[i];
3545             break;
3546         case FF_CMP_PSNR:
3547             cmp[i]= c->quant_psnr[i];
3548             break;
3549         case FF_CMP_BIT:
3550             cmp[i]= c->bit[i];
3551             break;
3552         case FF_CMP_RD:
3553             cmp[i]= c->rd[i];
3554             break;
3555         case FF_CMP_VSAD:
3556             cmp[i]= c->vsad[i];
3557             break;
3558         case FF_CMP_VSSE:
3559             cmp[i]= c->vsse[i];
3560             break;
3561         case FF_CMP_ZERO:
3562             cmp[i]= zero_cmp;
3563             break;
3564         case FF_CMP_NSSE:
3565             cmp[i]= c->nsse[i];
3566             break;
3567 #if CONFIG_SNOW_ENCODER
3568         case FF_CMP_W53:
3569             cmp[i]= c->w53[i];
3570             break;
3571         case FF_CMP_W97:
3572             cmp[i]= c->w97[i];
3573             break;
3574 #endif
3575         default:
3576             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3577         }
3578     }
3579 }
3580
3581 static void clear_block_c(DCTELEM *block)
3582 {
3583     memset(block, 0, sizeof(DCTELEM)*64);
3584 }
3585
3586 /**
3587  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3588  */
3589 static void clear_blocks_c(DCTELEM *blocks)
3590 {
3591     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3592 }
3593
3594 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3595     long i;
3596     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3597         long a = *(long*)(src+i);
3598         long b = *(long*)(dst+i);
3599         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3600     }
3601     for(; i<w; i++)
3602         dst[i+0] += src[i+0];
3603 }
3604
3605 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3606     long i;
3607     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3608         long a = *(long*)(src1+i);
3609         long b = *(long*)(src2+i);
3610         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3611     }
3612     for(; i<w; i++)
3613         dst[i] = src1[i]+src2[i];
3614 }
3615
3616 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3617     long i;
3618 #if !HAVE_FAST_UNALIGNED
3619     if((long)src2 & (sizeof(long)-1)){
3620         for(i=0; i+7<w; i+=8){
3621             dst[i+0] = src1[i+0]-src2[i+0];
3622             dst[i+1] = src1[i+1]-src2[i+1];
3623             dst[i+2] = src1[i+2]-src2[i+2];
3624             dst[i+3] = src1[i+3]-src2[i+3];
3625             dst[i+4] = src1[i+4]-src2[i+4];
3626             dst[i+5] = src1[i+5]-src2[i+5];
3627             dst[i+6] = src1[i+6]-src2[i+6];
3628             dst[i+7] = src1[i+7]-src2[i+7];
3629         }
3630     }else
3631 #endif
3632     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3633         long a = *(long*)(src1+i);
3634         long b = *(long*)(src2+i);
3635         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3636     }
3637     for(; i<w; i++)
3638         dst[i+0] = src1[i+0]-src2[i+0];
3639 }
3640
/**
 * Reconstructs a row from residuals using median prediction.
 * Each output byte is mid_pred(left, top, left + top - topleft) + diff, where
 * top comes from src1.
 * @param left     in: value left of the first pixel; out: last reconstructed value
 * @param left_top in: src1 value left of the first pixel; out: last src1 value read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top  = src1[x];
        const int grad = (cur + top - topleft)&0xFF;

        cur     = mid_pred(cur, top, grad) + diff[x];
        topleft = top;
        dst[x]  = cur;
    }

    *left     = cur;
    *left_top = topleft;
}
3657
/**
 * Computes median-prediction residuals for a row.
 * Each output byte is src2 minus mid_pred(left, top, left + top - topleft),
 * where top comes from src1 and left is the previous src2 value.
 * @param left     in: value left of the first pixel; out: last src2 value read
 * @param left_top in: src1 value left of the first pixel; out: last src1 value read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top  = src1[x];
        const int pred = mid_pred(cur, top, (cur + top - topleft)&0xFF);

        topleft = top;
        cur     = src2[x];
        dst[x]  = cur - pred;
    }

    *left     = cur;
    *left_top = topleft;
}
3675
/**
 * Running sum over src: dst[i] receives the low 8 bits of
 * acc + src[0] + ... + src[i].
 * @param acc starting value of the accumulator
 * @return the final accumulator, not reduced to 8 bits
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for(pos=0; pos<w; pos++){
        acc += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
3694
/* byte offsets of the four components inside a 4-byte pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-component running sum over w 4-byte pixels: every component of dst
 * receives the low 8 bits of its accumulator after adding the matching
 * component of src.
 * @param red,green,blue,alpha in: starting accumulators; out: final accumulators
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int n;

    for(n=0; n<w; n++, src+=4, dst+=4){
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3735
/* o1 = i1 + i2, o2 = i1 - i2; expands to two statements */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly: x = x + y, y = x - y (using the old x).
 * The temporaries carry a prefix so that callers may pass variables
 * named a or b without them being shadowed inside the macro. */
#define BUTTERFLY1(x,y) \
{\
    int bfly_a_, bfly_b_;\
    bfly_a_= (x);\
    bfly_b_= (y);\
    (x)= bfly_a_+bfly_b_;\
    (y)= bfly_a_-bfly_b_;\
}

/* |x + y| + |x - y| */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3750
/**
 * Applies an 8x8 butterfly (Hadamard-style) transform to src - dst and
 * returns the sum of the absolute values of the result.
 * @param s unused
 * @param h must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of differences */
    for(i=0; i<8; i++){
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;
        int *t = temp + 8*i;

        BUTTERFLY2(t[0], t[1], sp[0]-dp[0],sp[1]-dp[1]);
        BUTTERFLY2(t[2], t[3], sp[2]-dp[2],sp[3]-dp[3]);
        BUTTERFLY2(t[4], t[5], sp[4]-dp[4],sp[5]-dp[5]);
        BUTTERFLY2(t[6], t[7], sp[6]-dp[6],sp[7]-dp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(i=0; i<8; i++){
        int *t = temp + i;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum +=
             BUTTERFLYA(t[8*0], t[8*4])
            +BUTTERFLYA(t[8*1], t[8*5])
            +BUTTERFLYA(t[8*2], t[8*6])
            +BUTTERFLYA(t[8*3], t[8*7]);
    }
    return sum;
}
3802
/**
 * Applies an 8x8 butterfly (Hadamard-style) transform to src itself and
 * returns the sum of the absolute values of the result, with the
 * |temp[0] + temp[32]| term (the mean) removed.
 * @param s     unused
 * @param dummy unused
 * @param h     must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row */
    for(i=0; i<8; i++){
        const uint8_t *sp = src + stride*i;
        int *t = temp + 8*i;

        BUTTERFLY2(t[0], t[1], sp[0],sp[1]);
        BUTTERFLY2(t[2], t[3], sp[2],sp[3]);
        BUTTERFLY2(t[4], t[5], sp[4],sp[5]);
        BUTTERFLY2(t[6], t[7], sp[6],sp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(i=0; i<8; i++){
        int *t = temp + i;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum +=
             BUTTERFLYA(t[8*0], t[8*4])
            +BUTTERFLYA(t[8*1], t[8*5])
            +BUTTERFLYA(t[8*2], t[8*6])
            +BUTTERFLYA(t[8*3], t[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3850
3851 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3852     MpegEncContext * const s= (MpegEncContext *)c;
3853     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3854
3855     assert(h==8);
3856
3857     s->dsp.diff_pixels(temp, src1, src2, stride);
3858     s->dsp.fdct(temp);
3859     return s->dsp.sum_abs_dctelem(temp);
3860 }
3861
#if CONFIG_GPL
/* One 8-point integer transform pass; reads through SRC(n) and emits each
 * output through DST(n, value), both supplied by the user of the macro. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Takes the pixel difference of two 8x8 blocks, applies DCT8_1D along rows
 * and then along columns, and returns the sum of the absolute values of the
 * second pass outputs.
 * @param c MpegEncContext whose dsp.diff_pixels is used
 * @param h unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= c;
    DCTELEM dct[8][8];
    int sum=0;
    int i;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: each row, results stored back into the row */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: each column, results only accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    return sum;
}
#endif
3914
3915 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3916     MpegEncContext * const s= (MpegEncContext *)c;
3917     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3918     int sum=0, i;
3919
3920     assert(h==8);
3921
3922     s->dsp.diff_pixels(temp, src1, src2, stride);
3923     s->dsp.fdct(temp);
3924
3925     for(i=0; i<64; i++)
3926         sum= FFMAX(sum, FFABS(temp[i]));
3927
3928     return sum;
3929 }
3930
3931 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3932     MpegEncContext * const s= (MpegEncContext *)c;
3933     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3934     DCTELEM * const bak = temp+64;
3935     int sum=0, i;
3936
3937     assert(h==8);
3938     s->mb_intra=0;
3939
3940     s->dsp.diff_pixels(temp, src1, src2, stride);
3941
3942     memcpy(bak, temp, 64*sizeof(DCTELEM));
3943
3944     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3945     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3946     ff_simple_idct(temp); //FIXME
3947
3948     for(i=0; i<64; i++)
3949         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3950
3951     return sum;
3952 }
3953
3954 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3955     MpegEncContext * const s= (MpegEncContext *)c;
3956     const uint8_t *scantable= s->intra_scantable.permutated;
3957     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3958     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3959     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3960     int i, last, run, bits, level, distortion, start_i;
3961     const int esc_length= s->ac_esc_length;
3962     uint8_t * length;
3963     uint8_t * last_length;
3964
3965     assert(h==8);
3966
3967     copy_block8(lsrc1, src1, 8, stride, 8);
3968     copy_block8(lsrc2, src2, 8, stride, 8);
3969
3970     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3971
3972     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3973
3974     bits=0;
3975
3976     if (s->mb_intra) {
3977         start_i = 1;
3978         length     = s->intra_ac_vlc_length;
3979         last_length= s->intra_ac_vlc_last_length;
3980         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3981     } else {
3982         start_i = 0;
3983         length     = s->inter_ac_vlc_length;
3984         last_length= s->inter_ac_vlc_last_length;
3985     }
3986
3987     if(last>=start_i){
3988         run=0;
3989         for(i=start_i; i<last; i++){
3990             int j= scantable[i];
3991             level= temp[j];
3992
3993             if(level){
3994                 level+=64;
3995                 if((level&(~127)) == 0){
3996                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3997                 }else
3998                     bits+= esc_length;
3999                 run=0;
4000             }else
4001                 run++;
4002         }
4003         i= scantable[last];
4004
4005         level= temp[i] + 64;
4006
4007         assert(level - 64);
4008
4009         if((level&(~127)) == 0){
4010             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4011         }else
4012             bits+= esc_length;
4013
4014     }
4015
4016     if(last>=0){
4017         if(s->mb_intra)
4018             s->dct_unquantize_intra(s, temp, 0, s->qscale);
4019         else
4020             s->dct_unquantize_inter(s, temp, 0, s->qscale);
4021     }
4022
4023     s->dsp.idct_add(lsrc2, 8, temp);
4024
4025     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
4026
4027     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
4028 }
4029
4030 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
4031     MpegEncContext * const s= (MpegEncContext *)c;
4032     const uint8_t *scantable= s->intra_scantable.permutated;
4033     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
4034     int i, last, run, bits, level, start_i;
4035     const int esc_length= s->ac_esc_length;
4036     uint8_t * length;
4037     uint8_t * last_length;
4038
4039     assert(h==8);
4040
4041     s->dsp.diff_pixels(temp, src1, src2, stride);
4042
4043     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
4044
4045     bits=0;
4046
4047     if (s->mb_intra) {
4048         start_i = 1;
4049         length     = s->intra_ac_vlc_length;
4050         last_length= s->intra_ac_vlc_last_length;
4051         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
4052     } else {
4053         start_i = 0;
4054         length     = s->inter_ac_vlc_length;
4055         last_length= s->inter_ac_vlc_last_length;
4056     }
4057
4058     if(last>=start_i){
4059         run=0;
4060         for(i=start_i; i<last; i++){
4061             int j= scantable[i];
4062             level= temp[j];
4063
4064             if(level){
4065                 level+=64;
4066                 if((level&(~127)) == 0){
4067                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
4068                 }else
4069                     bits+= esc_length;
4070                 run=0;
4071             }else
4072                 run++;
4073         }
4074         i= scantable[last];
4075
4076         level= temp[i] + 64;
4077
4078         assert(level - 64);
4079
4080         if((level&(~127)) == 0){
4081             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4082         }else
4083             bits+= esc_length;
4084     }
4085
4086     return bits;
4087 }
4088
/*
 * Generates vsad_intra<size>_c(): the sum over rows 1..h-1 and columns
 * 0..size-1 of |s[x] - s[x+stride]|, i.e. the absolute difference between
 * each pixel and the one directly below it. The context and the second
 * picture pointer are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                  \
    int row, col;                                                   \
                                                                    \
    for (row = 1; row < h; row++, s += stride)                      \
        for (col = 0; col < size; col++)                            \
            total += FFABS(s[col] - s[col + stride]);               \
                                                                    \
    return total;                                                   \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
4106
/**
 * Vertical SAD of the difference picture, 16 pixels wide.
 *
 * For each of the h-1 vertically adjacent row pairs and each of 16 columns,
 * accumulates |(s1 - s2) at this row - (s1 - s2) at the row below|.
 *
 * @param c not used
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            total += FFABS(here - below);
        }
    }

    return total;
}
4121
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum over rows 1..h-1 and columns
 * 0..size-1 of (s[x] - s[x+stride])^2, i.e. the squared difference between
 * each pixel and the one directly below it. The context and the second
 * picture pointer are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                  \
    int row, col;                                                   \
                                                                    \
    for (row = 1; row < h; row++, s += stride)                      \
        for (col = 0; col < size; col++)                            \
            total += SQ(s[col] - s[col + stride]);                  \
                                                                    \
    return total;                                                   \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
4140
/**
 * Vertical SSE of the difference picture, 16 pixels wide.
 *
 * For each of the h-1 vertically adjacent row pairs and each of 16 columns,
 * accumulates the square of
 * (s1 - s2) at this row minus (s1 - s2) at the row below.
 *
 * @param c not used
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += delta * delta;
        }
    }

    return total;
}
4155
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first operand, size elements
 * @param pix2 second operand, size elements
 * @param size number of elements compared
 * @return sum over k of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];
        total += delta * delta;
    }

    return total;
}
4164
/*
 * Instantiate a second comparator from each 8x8 comparator above.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this excerpt; from the
 * naming, each line presumably defines the second-named function (the
 * 16-pixel variant) in terms of the first-named 8x8 one -- confirm against
 * the macro definition.
 * The dct264 pair exists only when CONFIG_GPL is enabled, matching the
 * guard around dct264_sad8x8_c itself.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4175
/**
 * In-place element-wise product: dst[k] = dst[k] * src[k] for k < len.
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] = dst[k] * src[k];
        k++;
    }
}
4181
/**
 * Element-wise product with the second operand read back to front:
 * dst[k] = src0[k] * src1[len-1-k] for k < len.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int mirrored = len - 1 - k;
        dst[k] = src0[k] * src1[mirrored];
    }
}
4188
/**
 * Element-wise multiply-add: dst[k] = src0[k] * src1[k] + src2[k] for k < len.
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
4194
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 *
 * For k in [0, len), with lo = k and hi = 2*len-1-k:
 *   dst[lo] = src0[k] * win[hi] - src1[len-1-k] * win[lo] + add_bias
 *   dst[hi] = src0[k] * win[lo] + src1[len-1-k] * win[hi] + add_bias
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements, consumed back to front
 * @param win  window, 2*len elements
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int lo = k;
        const int hi = 2*len - 1 - k;
        /* all four inputs are read before either output is written */
        const float s0 = src0[lo];
        const float s1 = src1[len - 1 - k];
        const float wi = win[lo];
        const float wj = win[hi];

        dst[lo] = s0*wj - s1*wi + add_bias;
        dst[hi] = s0*wi + s1*wj + add_bias;
    }
}
4209
/**
 * Scale a vector: dst[k] = src[k] * mul for k < len.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
4217
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Each consecutive pair of outputs uses the next pointer in sv:
 * dst[2n+j] = src[2n+j] * sv[n][j] * mul, j = 0..1.
 * Elements are handled two at a time starting at index 0 while the index is
 * below len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base    ] = src[base    ] * vec[0] * mul;
        dst[base + 1] = src[base + 1] * vec[1] * mul;
    }
}
4227
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Each consecutive group of four outputs uses the next pointer in sv:
 * dst[4n+j] = src[4n+j] * sv[n][j] * mul, j = 0..3.
 * Elements are handled four at a time starting at index 0 while the index
 * is below len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;
        int lane;

        for (lane = 0; lane < 4; lane++)
            dst[base + lane] = src[base + lane] * vec[lane] * mul;
    }
}
4239
/**
 * Expand a sequence of 2-element vectors, scaled by mul, into dst:
 * dst[2n+j] = sv[n][j] * mul, j = 0..1.
 * Elements are written two at a time starting at index 0 while the index is
 * below len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base    ] = vec[0] * mul;
        dst[base + 1] = vec[1] * mul;
    }
}
4249
/**
 * Expand a sequence of 4-element vectors, scaled by mul, into dst:
 * dst[4n+j] = sv[n][j] * mul, j = 0..3.
 * Elements are written four at a time starting at index 0 while the index
 * is below len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;
        int lane;

        for (lane = 0; lane < 4; lane++)
            dst[base + lane] = vec[lane] * mul;
    }
}
4261
/**
 * In-place butterfly on two non-overlapping vectors: for each k < len,
 * v1[k] becomes the sum and v2[k] the difference (old v1 - old v2).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
4272
/**
 * Dot product of two float vectors, accumulated in single precision in
 * index order.
 *
 * @return sum over k < len of v1[k] * v2[k]; 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
4283
/**
 * Convert integers to float and scale: dst[k] = src[k] * mul for k < len.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
4289
/**
 * Clip one value, working on 32-bit patterns with unsigned comparisons.
 *
 * The caller in this file passes the raw bits of floats and uses this only
 * when the lower bound is negative and the upper bound positive.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the lower bound; returned when a > mini
 *                 as unsigned integers
 * @param maxi     bit pattern of the upper bound
 * @param maxisign maxi with bit 31 flipped; maxi is returned when a with
 *                 bit 31 flipped compares greater than this
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    const uint32_t sign_bit = 1U << 31;

    if(a > mini) return mini;
    else if((a ^ sign_bit) > maxisign) return maxi;
    else return a;
}
4298
/**
 * Clip a float vector to [*min, *max] using integer comparisons on the raw
 * bit patterns, via clipf_c_one().
 *
 * The only caller in this file uses it when *min < 0 and *max > 0.
 * Eight elements are processed per iteration starting at index 0 while the
 * index is below len, so for a len that is not a multiple of 8 the buffers
 * are accessed up to the next multiple of 8.
 *
 * NOTE(review): the floats are reinterpreted through uint32_t pointer casts;
 * this relies on the build tolerating that aliasing -- confirm.
 *
 * @param dst destination
 * @param src source
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip a float vector to [min, max].
 *
 * When the bounds straddle zero (min < 0 and max > 0) the work is delegated
 * to vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. In the av_clipf path eight elements are processed per iteration
 * starting at index 0 while the index is below len, so for a len that is
 * not a multiple of 8 the buffers are accessed up to the next multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, lane;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (lane = 0; lane < 8; lane++)
            dst[base + lane] = av_clipf(src[base + lane], min, max);
}
4334
4335 static av_always_inline int float_to_int16_one(const float *src){
4336     int_fast32_t tmp = *(const int32_t*)src;
4337     if(tmp & 0xf0000){
4338         tmp = (0x43c0ffff - tmp)>>31;
4339         // is this faster on some gcc/cpu combinations?
4340 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4341 //      else                 tmp = 0;
4342     }
4343     return tmp - 0x8000;
4344 }
4345
/**
 * Convert len floats to int16_t with float_to_int16_one(), element by
 * element: dst[k] = float_to_int16_one(&src[k]).
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int k = 0;

    while (k < len) {
        dst[k] = float_to_int16_one(&src[k]);
        k++;
    }
}
4351
/**
 * Convert planar float channels to interleaved int16_t.
 *
 * Sample i of channel ch is written to dst[i*channels + ch], converted with
 * float_to_int16_one(). Two channels take a dedicated loop that writes both
 * samples of a frame together; any other count is handled one channel at a
 * time.
 *
 * @param dst      output, len*channels samples
 * @param src      array of `channels` pointers, each to len floats
 * @param len      samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, ch;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2*i]   = float_to_int16_one(src[0] + i);
            dst[2*i+1] = float_to_int16_one(src[1] + i);
        }
        return;
    }

    for (ch = 0; ch < channels; ch++) {
        for (i = 0; i < len; i++)
            dst[i*channels + ch] = float_to_int16_one(src[ch] + i);
    }
}
4365
/**
 * Dot product of two int16_t vectors with a per-term right shift.
 *
 * @param order number of elements
 * @param shift each product is shifted right by this amount before being
 *              accumulated
 * @return sum over k of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order != 0; order--) {
        const int product = *v1 * *v2;

        acc += product >> shift;
        v1++;
        v2++;
    }

    return acc;
}
4375
/**
 * Dot product of v1 and v2 combined with an in-place update of v1.
 *
 * For each k < order the product v1[k]*v2[k] is accumulated using the value
 * of v1[k] from before the update, then v1[k] += mul * v3[k] (stored back
 * as int16_t).
 *
 * @return the accumulated dot product
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order != 0; order--) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
        v1++;
        v2++;
        v3++;
    }

    return acc;
}
4385
/* Fixed-point cosine weights: Wk = 2048*sqrt(2)*cos(k*pi/16) for k = 1..7 */
#define W0 2048
#define W1 2841 /* k = 1 */
#define W2 2676 /* k = 2 */
#define W3 2408 /* k = 3 */
#define W4 2048 /* k = 4 */
#define W5 1609 /* k = 5 */
#define W6 1108 /* k = 6 */
#define W7 565  /* k = 7 */

/**
 * Row pass of the WMV2 inverse transform: transforms the 8 consecutive
 * coefficients at b in place. Results are rounded and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    const int round = 1 << 7;
    int s1, s2;
    int a0, a1, a2, a3, a4, a5, a6, a7;

    /* step 1: weighted pairs */
    a1 = W1*b[1] + W7*b[7];
    a7 = W7*b[1] - W1*b[7];
    a5 = W5*b[5] + W3*b[3];
    a3 = W3*b[5] - W5*b[3];
    a2 = W2*b[2] + W6*b[6];
    a6 = W6*b[2] - W2*b[6];
    a0 = W0*b[0] + W0*b[4];
    a4 = W0*b[0] - W0*b[4];

    /* step 2: cross terms of the odd inputs (1,3,5,7), scaled by 181/256 */
    s1 = (181*(a1-a5+a7-a3) + 128) >> 8;
    s2 = (181*(a1-a5-a7+a3) + 128) >> 8;

    /* step 3: combine, round and scale */
    b[0] = (a0+a2+a1+a5 + round) >> 8;
    b[1] = (a4+a6 +s1   + round) >> 8;
    b[2] = (a4-a6 +s2   + round) >> 8;
    b[3] = (a0-a2+a7+a3 + round) >> 8;
    b[4] = (a0-a2-a7-a3 + round) >> 8;
    b[5] = (a4-a6 -s2   + round) >> 8;
    b[6] = (a4+a6 -s1   + round) >> 8;
    b[7] = (a0+a2-a1-a5 + round) >> 8;
}

/**
 * Column pass of the WMV2 inverse transform: transforms the 8 coefficients
 * b[0], b[8], ..., b[56] in place. Step 1 keeps 3 fewer bits than the raw
 * products; the final results are rounded and scaled down by 14 bits.
 */
static void wmv2_idct_col(short * b)
{
    const int round = 1 << 13;
    int s1, s2;
    int a0, a1, a2, a3, a4, a5, a6, a7;

    /* step 1, with extended precision */
    a1 = (W1*b[8*1] + W7*b[8*7] + 4) >> 3;
    a7 = (W7*b[8*1] - W1*b[8*7] + 4) >> 3;
    a5 = (W5*b[8*5] + W3*b[8*3] + 4) >> 3;
    a3 = (W3*b[8*5] - W5*b[8*3] + 4) >> 3;
    a2 = (W2*b[8*2] + W6*b[8*6] + 4) >> 3;
    a6 = (W6*b[8*2] - W2*b[8*6] + 4) >> 3;
    a0 = (W0*b[8*0] + W0*b[8*4]    ) >> 3;
    a4 = (W0*b[8*0] - W0*b[8*4]    ) >> 3;

    /* step 2: cross terms of the odd inputs, scaled by 181/256 */
    s1 = (181*(a1-a5+a7-a3) + 128) >> 8;
    s2 = (181*(a1-a5-a7+a3) + 128) >> 8;

    /* step 3: combine, round and scale */
    b[8*0] = (a0+a2+a1+a5 + round) >> 14;
    b[8*1] = (a4+a6 +s1   + round) >> 14;
    b[8*2] = (a4-a6 +s2   + round) >> 14;
    b[8*3] = (a0-a2+a7+a3 + round) >> 14;

    b[8*4] = (a0-a2-a7-a3 + round) >> 14;
    b[8*5] = (a4-a6 -s2   + round) >> 14;
    b[8*6] = (a4+a6 -s1   + round) >> 14;
    b[8*7] = (a0+a2-a1-a5 + round) >> 14;
}

/**
 * WMV2 8x8 inverse transform, in place: all eight rows first, then all
 * eight columns.
 *
 * @param block 64 coefficients, row-major
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
/* XXX: these wrapper functions should be removed as soon as all IDCTs
 have been converted */
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4480
/**
 * Reduced-size variant: run j_rev_dct4() on block in place, then hand the
 * result to put_pixels_clamped4_c() together with dest and line_size.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Reduced-size variant: run j_rev_dct4() on block in place, then hand the
 * result to add_pixels_clamped4_c() together with dest and line_size.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4491
/**
 * Reduced-size variant: run j_rev_dct2() on block in place, then hand the
 * result to put_pixels_clamped2_c() together with dest and line_size.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Reduced-size variant: run j_rev_dct2() on block in place, then hand the
 * result to add_pixels_clamped2_c() together with dest and line_size.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4502
4503 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4504 {
4505     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4506
4507     dest[0] = cm[(block[0] + 4)>>3];
4508 }
4509 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4510 {
4511     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4512
4513     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4514 }
4515
4516 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4517
4518 /* init static data */
4519 av_cold void dsputil_static_init(void)
4520 {
4521     int i;
4522
4523     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4524     for(i=0;i<MAX_NEG_CROP;i++) {
4525         ff_cropTbl[i] = 0;
4526         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4527     }
4528
4529     for(i=0;i<512;i++) {
4530         ff_squareTbl[i] = (i - 256) * (i - 256);
4531     }
4532
4533     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4534 }
4535
4536 int ff_check_alignment(void){
4537     static int did_fail=0;
4538     DECLARE_ALIGNED_16(int, aligned);
4539
4540     if((intptr_t)&aligned & 15){
4541         if(!did_fail){
4542 #if HAVE_MMX || HAVE_ALTIVEC
4543             av_log(NULL, AV_LOG_ERROR,
4544                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4545                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4546                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4547                 "Do not report crashes to FFmpeg developers.\n");
4548 #endif
4549             did_fail=1;
4550         }
4551         return -1;
4552     }
4553     return 0;
4554 }
4555
4556 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4557 {
4558     int i;
4559
4560     ff_check_alignment();
4561
4562 #if CONFIG_ENCODERS
4563     if(avctx->dct_algo==FF_DCT_FASTINT) {
4564         c->fdct = fdct_ifast;
4565         c->fdct248 = fdct_ifast248;
4566     }
4567     else if(avctx->dct_algo==FF_DCT_FAAN) {
4568         c->fdct = ff_faandct;
4569         c->fdct248 = ff_faandct248;
4570     }
4571     else {
4572         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4573         c->fdct248 = ff_fdct248_islow;
4574     }
4575 #endif //CONFIG_ENCODERS
4576
4577     if(avctx->lowres==1){
4578         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4579             c->idct_put= ff_jref_idct4_put;
4580             c->idct_add= ff_jref_idct4_add;
4581         }else{
4582             c->idct_put= ff_h264_lowres_idct_put_c;
4583             c->idct_add= ff_h264_lowres_idct_add_c;
4584         }
4585         c->idct    = j_rev_dct4;
4586         c->idct_permutation_type= FF_NO_IDCT_PERM;
4587     }else if(avctx->lowres==2){
4588         c->idct_put= ff_jref_idct2_put;
4589         c->idct_add= ff_jref_idct2_add;
4590         c->idct    = j_rev_dct2;
4591         c->idct_permutation_type= FF_NO_IDCT_PERM;
4592     }else if(avctx->lowres==3){
4593         c->idct_put= ff_jref_idct1_put;
4594         c->idct_add= ff_jref_idct1_add;
4595         c->idct    = j_rev_dct1;
4596         c->idct_permutation_type= FF_NO_IDCT_PERM;
4597     }else{
4598         if(avctx->idct_algo==FF_IDCT_INT){
4599             c->idct_put= ff_jref_idct_put;
4600             c->idct_add= ff_jref_idct_add;
4601             c->idct    = j_rev_dct;
4602             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4603         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4604                 avctx->idct_algo==FF_IDCT_VP3){
4605             c->idct_put= ff_vp3_idct_put_c;
4606             c->idct_add= ff_vp3_idct_add_c;
4607             c->idct    = ff_vp3_idct_c;
4608             c->idct_permutation_type= FF_NO_IDCT_PERM;
4609         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4610             c->idct_put= ff_wmv2_idct_put_c;
4611             c->idct_add= ff_wmv2_idct_add_c;
4612             c->idct    = ff_wmv2_idct_c;
4613             c->idct_permutation_type= FF_NO_IDCT_PERM;
4614         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4615             c->idct_put= ff_faanidct_put;
4616             c->idct_add= ff_faanidct_add;
4617             c->idct    = ff_faanidct;
4618             c->idct_permutation_type= FF_NO_IDCT_PERM;
4619         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4620             c->idct_put= ff_ea_idct_put_c;
4621             c->idct_permutation_type= FF_NO_IDCT_PERM;
4622         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4623             c->idct     = ff_bink_idct_c;
4624             c->idct_add = ff_bink_idct_add_c;
4625             c->idct_put = ff_bink_idct_put_c;
4626             c->idct_permutation_type = FF_NO_IDCT_PERM;
4627         }else{ //accurate/default
4628             c->idct_put= ff_simple_idct_put;
4629             c->idct_add= ff_simple_idct_add;
4630             c->idct    = ff_simple_idct;
4631             c->idct_permutation_type= FF_NO_IDCT_PERM;
4632         }
4633     }
4634
4635     if (CONFIG_H264_DECODER) {
4636         c->h264_idct_add= ff_h264_idct_add_c;
4637         c->h264_idct8_add= ff_h264_idct8_add_c;
4638         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4639         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4640         c->h264_idct_add16     = ff_h264_idct_add16_c;
4641         c->h264_idct8_add4     = ff_h264_idct8_add4_c;
4642         c->h264_idct_add8      = ff_h264_idct_add8_c;
4643         c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4644     }
4645
4646     c->get_pixels = get_pixels_c;
4647     c->diff_pixels = diff_pixels_c;
4648     c->put_pixels_clamped = put_pixels_clamped_c;
4649     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4650     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4651     c->add_pixels_clamped = add_pixels_clamped_c;
4652     c->add_pixels8 = add_pixels8_c;
4653     c->add_pixels4 = add_pixels4_c;
4654     c->sum_abs_dctelem = sum_abs_dctelem_c;
4655     c->gmc1 = gmc1_c;
4656     c->gmc = ff_gmc_c;
4657     c->clear_block = clear_block_c;
4658     c->clear_blocks = clear_blocks_c;
4659     c->pix_sum = pix_sum_c;
4660     c->pix_norm1 = pix_norm1_c;
4661
4662     c->fill_block_tab[0] = fill_block16_c;
4663     c->fill_block_tab[1] = fill_block8_c;
4664     c->scale_block = scale_block_c;
4665
4666     /* TODO [0] 16  [1] 8 */
4667     c->pix_abs[0][0] = pix_abs16_c;
4668     c->pix_abs[0][1] = pix_abs16_x2_c;
4669     c->pix_abs[0][2] = pix_abs16_y2_c;
4670     c->pix_abs[0][3] = pix_abs16_xy2_c;
4671     c->pix_abs[1][0] = pix_abs8_c;
4672     c->pix_abs[1][1] = pix_abs8_x2_c;
4673     c->pix_abs[1][2] = pix_abs8_y2_c;
4674     c->pix_abs[1][3] = pix_abs8_xy2_c;
4675
4676 #define dspfunc(PFX, IDX, NUM) \
4677     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4678     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4679     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4680     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4681
4682     dspfunc(put, 0, 16);
4683     dspfunc(put_no_rnd, 0, 16);
4684     dspfunc(put, 1, 8);
4685     dspfunc(put_no_rnd, 1, 8);
4686     dspfunc(put, 2, 4);
4687     dspfunc(put, 3, 2);
4688
4689     dspfunc(avg, 0, 16);
4690     dspfunc(avg_no_rnd, 0, 16);
4691     dspfunc(avg, 1, 8);
4692     dspfunc(avg_no_rnd, 1, 8);
4693     dspfunc(avg, 2, 4);
4694     dspfunc(avg, 3, 2);
4695 #undef dspfunc
4696
4697     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4698     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4699
4700     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4701     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4702     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4703     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4704     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4705     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4706     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4707     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4708     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4709
4710     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4711     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4712     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4713     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4714     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4715     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4716     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4717     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4718     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4719
4720 #define dspfunc(PFX, IDX, NUM) \
4721     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4722     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4723     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4724     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4725     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4726     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4727     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4728     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4729     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4730     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4731     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4732     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4733     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4734     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4735     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4736     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4737
4738     dspfunc(put_qpel, 0, 16);
4739     dspfunc(put_no_rnd_qpel, 0, 16);
4740
4741     dspfunc(avg_qpel, 0, 16);
4742     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4743
4744     dspfunc(put_qpel, 1, 8);
4745     dspfunc(put_no_rnd_qpel, 1, 8);
4746
4747     dspfunc(avg_qpel, 1, 8);
4748     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4749
4750     dspfunc(put_h264_qpel, 0, 16);
4751     dspfunc(put_h264_qpel, 1, 8);
4752     dspfunc(put_h264_qpel, 2, 4);
4753     dspfunc(put_h264_qpel, 3, 2);
4754     dspfunc(avg_h264_qpel, 0, 16);
4755     dspfunc(avg_h264_qpel, 1, 8);
4756     dspfunc(avg_h264_qpel, 2, 4);
4757
4758 #undef dspfunc
4759     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4760     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4761     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4762     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4763     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4764     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4765     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4766     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4767
4768     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4769     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4770     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4771     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4772     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4773     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4774     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4775     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4776     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4777     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4778     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4779     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4780     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4781     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4782     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4783     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4784     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4785     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4786     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4787     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4788
4789     c->draw_edges = draw_edges_c;
4790
4791 #if CONFIG_CAVS_DECODER
4792     ff_cavsdsp_init(c,avctx);
4793 #endif
4794
4795 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4796     ff_mlp_init(c, avctx);
4797 #endif
4798 #if CONFIG_VC1_DECODER
4799     ff_vc1dsp_init(c,avctx);
4800 #endif
4801 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4802     ff_intrax8dsp_init(c,avctx);
4803 #endif
4804 #if CONFIG_RV30_DECODER
4805     ff_rv30dsp_init(c,avctx);
4806 #endif
4807 #if CONFIG_RV40_DECODER
4808     ff_rv40dsp_init(c,avctx);
4809     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4810     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4811     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4812     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4813 #endif
4814
4815     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4816     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4817     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4818     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4819     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4820     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4821     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4822     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4823
4824 #define SET_CMP_FUNC(name) \
4825     c->name[0]= name ## 16_c;\
4826     c->name[1]= name ## 8x8_c;
4827
4828     SET_CMP_FUNC(hadamard8_diff)
4829     c->hadamard8_diff[4]= hadamard8_intra16_c;
4830     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4831     SET_CMP_FUNC(dct_sad)
4832     SET_CMP_FUNC(dct_max)
4833 #if CONFIG_GPL
4834     SET_CMP_FUNC(dct264_sad)
4835 #endif
4836     c->sad[0]= pix_abs16_c;
4837     c->sad[1]= pix_abs8_c;
4838     c->sse[0]= sse16_c;
4839     c->sse[1]= sse8_c;
4840     c->sse[2]= sse4_c;
4841     SET_CMP_FUNC(quant_psnr)
4842     SET_CMP_FUNC(rd)
4843     SET_CMP_FUNC(bit)
4844     c->vsad[0]= vsad16_c;
4845     c->vsad[4]= vsad_intra16_c;
4846     c->vsad[5]= vsad_intra8_c;
4847     c->vsse[0]= vsse16_c;
4848     c->vsse[4]= vsse_intra16_c;
4849     c->vsse[5]= vsse_intra8_c;
4850     c->nsse[0]= nsse16_c;
4851     c->nsse[1]= nsse8_c;
4852 #if CONFIG_SNOW_ENCODER
4853     c->w53[0]= w53_16_c;
4854     c->w53[1]= w53_8_c;
4855     c->w97[0]= w97_16_c;
4856     c->w97[1]= w97_8_c;
4857 #endif
4858
4859     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4860
4861     c->add_bytes= add_bytes_c;
4862     c->add_bytes_l2= add_bytes_l2_c;
4863     c->diff_bytes= diff_bytes_c;
4864     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4865     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4866     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4867     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4868     c->bswap_buf= bswap_buf;
4869 #if CONFIG_PNG_DECODER
4870     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4871 #endif
4872
4873     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4874     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4875     c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4876     c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4877     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4878     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4879     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4880     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4881     c->h264_loop_filter_strength= NULL;
4882
4883     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4884         c->h263_h_loop_filter= h263_h_loop_filter_c;
4885         c->h263_v_loop_filter= h263_v_loop_filter_c;
4886     }
4887
4888     if (CONFIG_VP3_DECODER) {
4889         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4890         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4891     }
4892     if (CONFIG_VP6_DECODER) {
4893         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4894     }
4895
4896     c->h261_loop_filter= h261_loop_filter_c;
4897
4898     c->try_8x8basis= try_8x8basis_c;
4899     c->add_8x8basis= add_8x8basis_c;
4900
4901 #if CONFIG_SNOW_DECODER
4902     c->vertical_compose97i = ff_snow_vertical_compose97i;
4903     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4904     c->inner_add_yblock = ff_snow_inner_add_yblock;
4905 #endif
4906
4907 #if CONFIG_VORBIS_DECODER
4908     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4909 #endif
4910 #if CONFIG_AC3_DECODER
4911     c->ac3_downmix = ff_ac3_downmix_c;
4912 #endif
4913 #if CONFIG_LPC
4914     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4915 #endif
4916     c->vector_fmul = vector_fmul_c;
4917     c->vector_fmul_reverse = vector_fmul_reverse_c;
4918     c->vector_fmul_add = vector_fmul_add_c;
4919     c->vector_fmul_window = ff_vector_fmul_window_c;
4920     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4921     c->vector_clipf = vector_clipf_c;
4922     c->float_to_int16 = ff_float_to_int16_c;
4923     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4924     c->scalarproduct_int16 = scalarproduct_int16_c;
4925     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4926     c->scalarproduct_float = scalarproduct_float_c;
4927     c->butterflies_float = butterflies_float_c;
4928     c->vector_fmul_scalar = vector_fmul_scalar_c;
4929
4930     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4931     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4932
4933     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4934     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4935
4936     c->shrink[0]= ff_img_copy_plane;
4937     c->shrink[1]= ff_shrink22;
4938     c->shrink[2]= ff_shrink44;
4939     c->shrink[3]= ff_shrink88;
4940
4941     c->prefetch= just_return;
4942
4943     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4944     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4945
4946     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4947     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4948     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4949     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4950     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4951     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4952     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4953     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4954     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4955
4956     for(i=0; i<64; i++){
4957         if(!c->put_2tap_qpel_pixels_tab[0][i])
4958             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4959         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4960             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4961     }
4962
4963     switch(c->idct_permutation_type){
4964     case FF_NO_IDCT_PERM:
4965         for(i=0; i<64; i++)
4966             c->idct_permutation[i]= i;
4967         break;
4968     case FF_LIBMPEG2_IDCT_PERM:
4969         for(i=0; i<64; i++)
4970             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4971         break;
4972     case FF_SIMPLE_IDCT_PERM:
4973         for(i=0; i<64; i++)
4974             c->idct_permutation[i]= simple_mmx_permutation[i];
4975         break;
4976     case FF_TRANSPOSE_IDCT_PERM:
4977         for(i=0; i<64; i++)
4978             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4979         break;
4980     case FF_PARTTRANS_IDCT_PERM:
4981         for(i=0; i<64; i++)
4982             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4983         break;
4984     case FF_SSE2_IDCT_PERM:
4985         for(i=0; i<64; i++)
4986             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4987         break;
4988     default:
4989         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4990     }
4991 }
4992