git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "mpegvideo.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "h263.h"
  37 #include "snow.h"
  38
  39 /* snow.c */
  40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  41
  42 /* vorbis.c */
  43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  44
  45 /* flacenc.c */
  46 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  47
  48 /* pngdec.c */
  49 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  50
     /* Byte lookup table, zero-initialized here; nothing in this part of the
        file fills it in, so it is presumably populated by init code elsewhere
        (TODO confirm). Users below address it as ff_cropTbl + MAX_NEG_CROP and
        index that pointer with signed values. */
  51 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
     /* 512-entry lookup table, zero-initialized here and presumably filled in
        by init code elsewhere (TODO confirm). Users below address it as
        ff_squareTbl + 256 and index it with pixel values or with pixel
        differences in the range -255..255. */
  52 uint32_t ff_squareTbl[512] = {0, };
  53
  54 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
     // ~0UL/255 is the byte 0x01 repeated across an unsigned long, so multiplying
     // it by a byte value replicates that value into every byte of the word.
  55 #define pb_7f (~0UL/255 * 0x7f)
  56 #define pb_80 (~0UL/255 * 0x80)
  57
     /* 8x8 zigzag scan order: entry i is the raster position (row * 8 + column)
        of the i-th coefficient along the zigzag path. */
  58 const uint8_t ff_zigzag_direct[64] = {
  59     0,   1,  8, 16,  9,  2,  3, 10,
  60     17, 24, 32, 25, 18, 11,  4,  5,
  61     12, 19, 26, 33, 40, 48, 41, 34,
  62     27, 20, 13,  6,  7, 14, 21, 28,
  63     35, 42, 49, 56, 57, 50, 43, 36,
  64     29, 22, 15, 23, 30, 37, 44, 51,
  65     58, 59, 52, 45, 38, 31, 39, 46,
  66     53, 60, 61, 54, 47, 55, 62, 63
  67 };
  68
  69 /* Specific zigzag scan for the 248 IDCT. Note that, unlike the
  70    specification, the fields are interleaved here. The table has 64
        entries, each a raster position (row * 8 + column) in the 8x8 block. */
  71 const uint8_t ff_zigzag248_direct[64] = {
  72      0,  8,  1,  9, 16, 24,  2, 10,
  73     17, 25, 32, 40, 48, 56, 33, 41,
  74     18, 26,  3, 11,  4, 12, 19, 27,
  75     34, 42, 49, 57, 50, 58, 35, 43,
  76     20, 28,  5, 13,  6, 14, 21, 29,
  77     36, 44, 51, 59, 52, 60, 37, 45,
  78     22, 30,  7, 15, 23, 31, 38, 46,
  79     53, 61, 54, 62, 39, 47, 55, 63,
  80 };
  81
  82 /* not permuted inverse of zigzag_direct, + 1, for the MMX quantizer;
        zero-initialized here and not filled in anywhere in this part of the
        file (presumably done by init code elsewhere -- TODO confirm) */
  83 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  84
     /* Alternate scan order that starts by running along the rows of the 8x8
        block. Same layout as ff_zigzag_direct: 64 raster positions
        (row * 8 + column), presumably one per scan step -- confirm against
        the users of this table. */
  85 const uint8_t ff_alternate_horizontal_scan[64] = {
  86     0,  1,   2,  3,  8,  9, 16, 17,
  87     10, 11,  4,  5,  6,  7, 15, 14,
  88     13, 12, 19, 18, 24, 25, 32, 33,
  89     26, 27, 20, 21, 22, 23, 28, 29,
  90     30, 31, 34, 35, 40, 41, 48, 49,
  91     42, 43, 36, 37, 38, 39, 44, 45,
  92     46, 47, 50, 51, 56, 57, 58, 59,
  93     52, 53, 54, 55, 60, 61, 62, 63,
  94 };
  95
     /* Alternate scan order that starts by running down the columns of the 8x8
        block. Same layout as ff_zigzag_direct: 64 raster positions
        (row * 8 + column), presumably one per scan step -- confirm against
        the users of this table. */
  96 const uint8_t ff_alternate_vertical_scan[64] = {
  97     0,  8,  16, 24,  1,  9,  2, 10,
  98     17, 25, 32, 40, 48, 56, 57, 49,
  99     41, 33, 26, 18,  3, 11,  4, 12,
 100     19, 27, 34, 42, 50, 58, 35, 43,
 101     51, 59, 20, 28,  5, 13,  6, 14,
 102     21, 29, 36, 44, 52, 60, 37, 45,
 103     53, 61, 22, 30,  7, 15, 23, 31,
 104     38, 46, 54, 62, 39, 47, 55, 63,
 105 };
 106
 107 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
     /* The entries appear to be 2^32/b rounded up. Entry 0 is 0 and entry 1 is
        capped at 2^32-1 because 2^32 does not fit in a uint32_t, which is why
        the identity above is only stated for b >= 2. The product a*inverse[b]
        needs more than 32 bits, so it must be formed in a 64-bit type before
        the shift. */
 108 const uint32_t ff_inverse[256]={
 109          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 110  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 111  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 112  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 113  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 114  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 115   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 116   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 117   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 118   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 119   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 120   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 121   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 122   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 123   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 124   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 125   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 126   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 127   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 128   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 129   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 130   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 131   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 132   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 133   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 134   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 135   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 136   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 137   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 138   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 139   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 140   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 141 };
 142
 143 /* Input permutation for the simple_idct_mmx */
     /* The 64 entries form a permutation of the coefficient indices
        0x00..0x3F; the table is static, so it is only visible in this file. */
 144 static const uint8_t simple_mmx_permutation[64]={
 145         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 146         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 147         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 148         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 149         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 150         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 151         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 152         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 153 };
 154
 155 static int pix_sum_c(uint8_t * pix, int line_size)
 156 {
 157     int s, i, j;
 158
 159     s = 0;
 160     for (i = 0; i < 16; i++) {
 161         for (j = 0; j < 16; j += 8) {
 162             s += pix[0];
 163             s += pix[1];
 164             s += pix[2];
 165             s += pix[3];
 166             s += pix[4];
 167             s += pix[5];
 168             s += pix[6];
 169             s += pix[7];
 170             pix += 8;
 171         }
 172         pix += line_size - 16;
 173     }
 174     return s;
 175 }
 176
 177 static int pix_norm1_c(uint8_t * pix, int line_size)
 178 {
 179     int s, i, j;
 180     uint32_t *sq = ff_squareTbl + 256;
 181
 182     s = 0;
 183     for (i = 0; i < 16; i++) {
 184         for (j = 0; j < 16; j += 8) {
 185 #if 0
 186             s += sq[pix[0]];
 187             s += sq[pix[1]];
 188             s += sq[pix[2]];
 189             s += sq[pix[3]];
 190             s += sq[pix[4]];
 191             s += sq[pix[5]];
 192             s += sq[pix[6]];
 193             s += sq[pix[7]];
 194 #else
 195 #if LONG_MAX > 2147483647
 196             register uint64_t x=*(uint64_t*)pix;
 197             s += sq[x&0xff];
 198             s += sq[(x>>8)&0xff];
 199             s += sq[(x>>16)&0xff];
 200             s += sq[(x>>24)&0xff];
 201             s += sq[(x>>32)&0xff];
 202             s += sq[(x>>40)&0xff];
 203             s += sq[(x>>48)&0xff];
 204             s += sq[(x>>56)&0xff];
 205 #else
 206             register uint32_t x=*(uint32_t*)pix;
 207             s += sq[x&0xff];
 208             s += sq[(x>>8)&0xff];
 209             s += sq[(x>>16)&0xff];
 210             s += sq[(x>>24)&0xff];
 211             x=*(uint32_t*)(pix+4);
 212             s += sq[x&0xff];
 213             s += sq[(x>>8)&0xff];
 214             s += sq[(x>>16)&0xff];
 215             s += sq[(x>>24)&0xff];
 216 #endif
 217 #endif
 218             pix += 8;
 219         }
 220         pix += line_size - 16;
 221     }
 222     return s;
 223 }
 224
 225 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 226     int i;
 227
 228     for(i=0; i+8<=w; i+=8){
 229         dst[i+0]= bswap_32(src[i+0]);
 230         dst[i+1]= bswap_32(src[i+1]);
 231         dst[i+2]= bswap_32(src[i+2]);
 232         dst[i+3]= bswap_32(src[i+3]);
 233         dst[i+4]= bswap_32(src[i+4]);
 234         dst[i+5]= bswap_32(src[i+5]);
 235         dst[i+6]= bswap_32(src[i+6]);
 236         dst[i+7]= bswap_32(src[i+7]);
 237     }
 238     for(;i<w; i++){
 239         dst[i+0]= bswap_32(src[i+0]);
 240     }
 241 }
 242
 243 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = ff_squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[0] - pix2[0]];
 251         s += sq[pix1[1] - pix2[1]];
 252         s += sq[pix1[2] - pix2[2]];
 253         s += sq[pix1[3] - pix2[3]];
 254         pix1 += line_size;
 255         pix2 += line_size;
 256     }
 257     return s;
 258 }
 259
 260 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 261 {
 262     int s, i;
 263     uint32_t *sq = ff_squareTbl + 256;
 264
 265     s = 0;
 266     for (i = 0; i < h; i++) {
 267         s += sq[pix1[0] - pix2[0]];
 268         s += sq[pix1[1] - pix2[1]];
 269         s += sq[pix1[2] - pix2[2]];
 270         s += sq[pix1[3] - pix2[3]];
 271         s += sq[pix1[4] - pix2[4]];
 272         s += sq[pix1[5] - pix2[5]];
 273         s += sq[pix1[6] - pix2[6]];
 274         s += sq[pix1[7] - pix2[7]];
 275         pix1 += line_size;
 276         pix2 += line_size;
 277     }
 278     return s;
 279 }
 280
 281 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 282 {
 283     int s, i;
 284     uint32_t *sq = ff_squareTbl + 256;
 285
 286     s = 0;
 287     for (i = 0; i < h; i++) {
 288         s += sq[pix1[ 0] - pix2[ 0]];
 289         s += sq[pix1[ 1] - pix2[ 1]];
 290         s += sq[pix1[ 2] - pix2[ 2]];
 291         s += sq[pix1[ 3] - pix2[ 3]];
 292         s += sq[pix1[ 4] - pix2[ 4]];
 293         s += sq[pix1[ 5] - pix2[ 5]];
 294         s += sq[pix1[ 6] - pix2[ 6]];
 295         s += sq[pix1[ 7] - pix2[ 7]];
 296         s += sq[pix1[ 8] - pix2[ 8]];
 297         s += sq[pix1[ 9] - pix2[ 9]];
 298         s += sq[pix1[10] - pix2[10]];
 299         s += sq[pix1[11] - pix2[11]];
 300         s += sq[pix1[12] - pix2[12]];
 301         s += sq[pix1[13] - pix2[13]];
 302         s += sq[pix1[14] - pix2[14]];
 303         s += sq[pix1[15] - pix2[15]];
 304
 305         pix1 += line_size;
 306         pix2 += line_size;
 307     }
 308     return s;
 309 }
 310
 311
 312 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 313 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 314     int s, i, j;
 315     const int dec_count= w==8 ? 3 : 4;
 316     int tmp[32*32];
 317     int level, ori;
 318     static const int scale[2][2][4][4]={
 319       {
 320         {
 321             // 9/7 8x8 dec=3
 322             {268, 239, 239, 213},
 323             {  0, 224, 224, 152},
 324             {  0, 135, 135, 110},
 325         },{
 326             // 9/7 16x16 or 32x32 dec=4
 327             {344, 310, 310, 280},
 328             {  0, 320, 320, 228},
 329             {  0, 175, 175, 136},
 330             {  0, 129, 129, 102},
 331         }
 332       },{
 333         {
 334             // 5/3 8x8 dec=3
 335             {275, 245, 245, 218},
 336             {  0, 230, 230, 156},
 337             {  0, 138, 138, 113},
 338         },{
 339             // 5/3 16x16 or 32x32 dec=4
 340             {352, 317, 317, 286},
 341             {  0, 328, 328, 233},
 342             {  0, 180, 180, 140},
 343             {  0, 132, 132, 105},
 344         }
 345       }
 346     };
 347
 348     for (i = 0; i < h; i++) {
 349         for (j = 0; j < w; j+=4) {
 350             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 351             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 352             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 353             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 354         }
 355         pix1 += line_size;
 356         pix2 += line_size;
 357     }
 358
 359     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 360
 361     s=0;
 362     assert(w==h);
 363     for(level=0; level<dec_count; level++){
 364         for(ori= level ? 1 : 0; ori<4; ori++){
 365             int size= w>>(dec_count-level);
 366             int sx= (ori&1) ? size : 0;
 367             int stride= 32<<(dec_count-level);
 368             int sy= (ori&2) ? stride>>1 : 0;
 369
 370             for(i=0; i<size; i++){
 371                 for(j=0; j<size; j++){
 372                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 373                     s += FFABS(v);
 374                 }
 375             }
 376         }
 377     }
 378     assert(s>=0);
 379     return s>>9;
 380 }
 381
 382 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 384 }
 385
 386 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 388 }
 389
 390 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 391     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 392 }
 393
 394 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 395     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 396 }
 397
 398 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 399     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 400 }
 401
 402 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 403     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 404 }
 405 #endif
 406
 407 /* draw the edges of width 'w' of an image of size width, height */
 408 //FIXME check that this is ok for mpeg4 interlaced
 409 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 410 {
 411     uint8_t *ptr, *last_line;
 412     int i;
 413
 414     last_line = buf + (height - 1) * wrap;
 415     for(i=0;i<w;i++) {
 416         /* top and bottom */
 417         memcpy(buf - (i + 1) * wrap, buf, width);
 418         memcpy(last_line + (i + 1) * wrap, last_line, width);
 419     }
 420     /* left and right */
 421     ptr = buf;
 422     for(i=0;i<height;i++) {
 423         memset(ptr - w, ptr[0], w);
 424         memset(ptr + width, ptr[width-1], w);
 425         ptr += wrap;
 426     }
 427     /* corners */
 428     for(i=0;i<w;i++) {
 429         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 430         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 431         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 432         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 433     }
 434 }
 435
 436 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 437 {
 438     int i;
 439
 440     /* read the pixels */
 441     for(i=0;i<8;i++) {
 442         block[0] = pixels[0];
 443         block[1] = pixels[1];
 444         block[2] = pixels[2];
 445         block[3] = pixels[3];
 446         block[4] = pixels[4];
 447         block[5] = pixels[5];
 448         block[6] = pixels[6];
 449         block[7] = pixels[7];
 450         pixels += line_size;
 451         block += 8;
 452     }
 453 }
 454
 455 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 456                           const uint8_t *s2, int stride){
 457     int i;
 458
 459     /* read the pixels */
 460     for(i=0;i<8;i++) {
 461         block[0] = s1[0] - s2[0];
 462         block[1] = s1[1] - s2[1];
 463         block[2] = s1[2] - s2[2];
 464         block[3] = s1[3] - s2[3];
 465         block[4] = s1[4] - s2[4];
 466         block[5] = s1[5] - s2[5];
 467         block[6] = s1[6] - s2[6];
 468         block[7] = s1[7] - s2[7];
 469         s1 += stride;
 470         s2 += stride;
 471         block += 8;
 472     }
 473 }
 474
 475
 476 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 477                                  int line_size)
 478 {
 479     int i;
 480     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 481
 482     /* read the pixels */
 483     for(i=0;i<8;i++) {
 484         pixels[0] = cm[block[0]];
 485         pixels[1] = cm[block[1]];
 486         pixels[2] = cm[block[2]];
 487         pixels[3] = cm[block[3]];
 488         pixels[4] = cm[block[4]];
 489         pixels[5] = cm[block[5]];
 490         pixels[6] = cm[block[6]];
 491         pixels[7] = cm[block[7]];
 492
 493         pixels += line_size;
 494         block += 8;
 495     }
 496 }
 497
 498 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 499                                  int line_size)
 500 {
 501     int i;
 502     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 503
 504     /* read the pixels */
 505     for(i=0;i<4;i++) {
 506         pixels[0] = cm[block[0]];
 507         pixels[1] = cm[block[1]];
 508         pixels[2] = cm[block[2]];
 509         pixels[3] = cm[block[3]];
 510
 511         pixels += line_size;
 512         block += 8;
 513     }
 514 }
 515
 516 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 517                                  int line_size)
 518 {
 519     int i;
 520     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 521
 522     /* read the pixels */
 523     for(i=0;i<2;i++) {
 524         pixels[0] = cm[block[0]];
 525         pixels[1] = cm[block[1]];
 526
 527         pixels += line_size;
 528         block += 8;
 529     }
 530 }
 531
 532 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 533                                         uint8_t *restrict pixels,
 534                                         int line_size)
 535 {
 536     int i, j;
 537
 538     for (i = 0; i < 8; i++) {
 539         for (j = 0; j < 8; j++) {
 540             if (*block < -128)
 541                 *pixels = 0;
 542             else if (*block > 127)
 543                 *pixels = 255;
 544             else
 545                 *pixels = (uint8_t)(*block + 128);
 546             block++;
 547             pixels++;
 548         }
 549         pixels += (line_size - 8);
 550     }
 551 }
 552
 553 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 554                           int line_size)
 555 {
 556     int i;
 557     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 558
 559     /* read the pixels */
 560     for(i=0;i<8;i++) {
 561         pixels[0] = cm[pixels[0] + block[0]];
 562         pixels[1] = cm[pixels[1] + block[1]];
 563         pixels[2] = cm[pixels[2] + block[2]];
 564         pixels[3] = cm[pixels[3] + block[3]];
 565         pixels[4] = cm[pixels[4] + block[4]];
 566         pixels[5] = cm[pixels[5] + block[5]];
 567         pixels[6] = cm[pixels[6] + block[6]];
 568         pixels[7] = cm[pixels[7] + block[7]];
 569         pixels += line_size;
 570         block += 8;
 571     }
 572 }
 573
 574 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 575                           int line_size)
 576 {
 577     int i;
 578     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 579
 580     /* read the pixels */
 581     for(i=0;i<4;i++) {
 582         pixels[0] = cm[pixels[0] + block[0]];
 583         pixels[1] = cm[pixels[1] + block[1]];
 584         pixels[2] = cm[pixels[2] + block[2]];
 585         pixels[3] = cm[pixels[3] + block[3]];
 586         pixels += line_size;
 587         block += 8;
 588     }
 589 }
 590
 591 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 592                           int line_size)
 593 {
 594     int i;
 595     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 596
 597     /* read the pixels */
 598     for(i=0;i<2;i++) {
 599         pixels[0] = cm[pixels[0] + block[0]];
 600         pixels[1] = cm[pixels[1] + block[1]];
 601         pixels += line_size;
 602         block += 8;
 603     }
 604 }
 605
 606 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 607 {
 608     int i;
 609     for(i=0;i<8;i++) {
 610         pixels[0] += block[0];
 611         pixels[1] += block[1];
 612         pixels[2] += block[2];
 613         pixels[3] += block[3];
 614         pixels[4] += block[4];
 615         pixels[5] += block[5];
 616         pixels[6] += block[6];
 617         pixels[7] += block[7];
 618         pixels += line_size;
 619         block += 8;
 620     }
 621 }
 622
 623 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 624 {
 625     int i;
 626     for(i=0;i<4;i++) {
 627         pixels[0] += block[0];
 628         pixels[1] += block[1];
 629         pixels[2] += block[2];
 630         pixels[3] += block[3];
 631         pixels += line_size;
 632         block += 4;
 633     }
 634 }
 635
 636 static int sum_abs_dctelem_c(DCTELEM *block)
 637 {
 638     int sum=0, i;
 639     for(i=0; i<64; i++)
 640         sum+= FFABS(block[i]);
 641     return sum;
 642 }
 643
 644 #if 0
 645
     /* Disabled (#if 0) variant of PIXOP2 that handles 8 pixels at a time in
        one uint64_t. PIXOP2(OPNAME, OP) expands to a family of functions named
        OPNAME##_..., each of which combines a computed 64-bit value into the
        destination with OP(dst, value) for h rows:
          - _pixels:            plain copy of the source word
          - _pixels_x2 / _y2:   per-byte average of the source and its right /
                                lower neighbour, rounded up:
                                (a|b) - (((a^b)&0xFE..FE)>>1)
          - _no_rnd_..._x2/_y2: the same average rounded down:
                                (a&b) + (((a^b)&0xFE..FE)>>1)
          - _xy2 variants:      four-pixel average built from separately summed
                                low 2 bits (l0/l1, with bias 0x02.. or 0x01..)
                                and high 6 bits (h0/h1); two rows per loop
                                iteration
        CALL_2X_PIXELS then builds the 16-pixel-wide versions.
        NOTE(review): the first function is defined as OPNAME##_pixels but
        CALL_2X_PIXELS refers to OPNAME##_pixels_c, so this disabled code would
        presumably not build as written. */
 646 #define PIXOP2(OPNAME, OP) \
 647 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 648 {\
 649     int i;\
 650     for(i=0; i<h; i++){\
 651         OP(*((uint64_t*)block), AV_RN64(pixels));\
 652         pixels+=line_size;\
 653         block +=line_size;\
 654     }\
 655 }\
 656 \
 657 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 658 {\
 659     int i;\
 660     for(i=0; i<h; i++){\
 661         const uint64_t a= AV_RN64(pixels  );\
 662         const uint64_t b= AV_RN64(pixels+1);\
 663         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 664         pixels+=line_size;\
 665         block +=line_size;\
 666     }\
 667 }\
 668 \
 669 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 670 {\
 671     int i;\
 672     for(i=0; i<h; i++){\
 673         const uint64_t a= AV_RN64(pixels  );\
 674         const uint64_t b= AV_RN64(pixels+1);\
 675         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 676         pixels+=line_size;\
 677         block +=line_size;\
 678     }\
 679 }\
 680 \
 681 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 682 {\
 683     int i;\
 684     for(i=0; i<h; i++){\
 685         const uint64_t a= AV_RN64(pixels          );\
 686         const uint64_t b= AV_RN64(pixels+line_size);\
 687         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 688         pixels+=line_size;\
 689         block +=line_size;\
 690     }\
 691 }\
 692 \
 693 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 694 {\
 695     int i;\
 696     for(i=0; i<h; i++){\
 697         const uint64_t a= AV_RN64(pixels          );\
 698         const uint64_t b= AV_RN64(pixels+line_size);\
 699         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 700         pixels+=line_size;\
 701         block +=line_size;\
 702     }\
 703 }\
 704 \
 705 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 706 {\
 707         int i;\
 708         const uint64_t a= AV_RN64(pixels  );\
 709         const uint64_t b= AV_RN64(pixels+1);\
 710         uint64_t l0=  (a&0x0303030303030303ULL)\
 711                     + (b&0x0303030303030303ULL)\
 712                     + 0x0202020202020202ULL;\
 713         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 714                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 715         uint64_t l1,h1;\
 716 \
 717         pixels+=line_size;\
 718         for(i=0; i<h; i+=2){\
 719             uint64_t a= AV_RN64(pixels  );\
 720             uint64_t b= AV_RN64(pixels+1);\
 721             l1=  (a&0x0303030303030303ULL)\
 722                + (b&0x0303030303030303ULL);\
 723             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 724               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 725             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 726             pixels+=line_size;\
 727             block +=line_size;\
 728             a= AV_RN64(pixels  );\
 729             b= AV_RN64(pixels+1);\
 730             l0=  (a&0x0303030303030303ULL)\
 731                + (b&0x0303030303030303ULL)\
 732                + 0x0202020202020202ULL;\
 733             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 734               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 735             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 736             pixels+=line_size;\
 737             block +=line_size;\
 738         }\
 739 }\
 740 \
 741 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 742 {\
 743         int i;\
 744         const uint64_t a= AV_RN64(pixels  );\
 745         const uint64_t b= AV_RN64(pixels+1);\
 746         uint64_t l0=  (a&0x0303030303030303ULL)\
 747                     + (b&0x0303030303030303ULL)\
 748                     + 0x0101010101010101ULL;\
 749         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 750                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 751         uint64_t l1,h1;\
 752 \
 753         pixels+=line_size;\
 754         for(i=0; i<h; i+=2){\
 755             uint64_t a= AV_RN64(pixels  );\
 756             uint64_t b= AV_RN64(pixels+1);\
 757             l1=  (a&0x0303030303030303ULL)\
 758                + (b&0x0303030303030303ULL);\
 759             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 760               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 761             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 762             pixels+=line_size;\
 763             block +=line_size;\
 764             a= AV_RN64(pixels  );\
 765             b= AV_RN64(pixels+1);\
 766             l0=  (a&0x0303030303030303ULL)\
 767                + (b&0x0303030303030303ULL)\
 768                + 0x0101010101010101ULL;\
 769             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 770               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 771             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 772             pixels+=line_size;\
 773             block +=line_size;\
 774         }\
 775 }\
 776 \
 777 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 778 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 779 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 780 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 781 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 782 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 783 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 784
     /* 64-bit op_avg for the disabled branch: replaces a with the per-byte
        average of a and b, rounded up, for 8 packed bytes. Both arguments are
        evaluated more than once. */
 785 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 786 #else // 64 bit variant
 787
 788 #define PIXOP2(OPNAME, OP) \
 789 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 790     int i;\
 791     for(i=0; i<h; i++){\
 792         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 793         pixels+=line_size;\
 794         block +=line_size;\
 795     }\
 796 }\
 797 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 798     int i;\
 799     for(i=0; i<h; i++){\
 800         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 801         pixels+=line_size;\
 802         block +=line_size;\
 803     }\
 804 }\
 805 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 806     int i;\
 807     for(i=0; i<h; i++){\
 808         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 809         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 810         pixels+=line_size;\
 811         block +=line_size;\
 812     }\
 813 }\
 814 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 815     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 816 }\
 817 \
 818 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 819                                                 int src_stride1, int src_stride2, int h){\
 820     int i;\
 821     for(i=0; i<h; i++){\
 822         uint32_t a,b;\
 823         a= AV_RN32(&src1[i*src_stride1  ]);\
 824         b= AV_RN32(&src2[i*src_stride2  ]);\
 825         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 826         a= AV_RN32(&src1[i*src_stride1+4]);\
 827         b= AV_RN32(&src2[i*src_stride2+4]);\
 828         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 829     }\
 830 }\
 831 \
 832 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 833                                                 int src_stride1, int src_stride2, int h){\
 834     int i;\
 835     for(i=0; i<h; i++){\
 836         uint32_t a,b;\
 837         a= AV_RN32(&src1[i*src_stride1  ]);\
 838         b= AV_RN32(&src2[i*src_stride2  ]);\
 839         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 840         a= AV_RN32(&src1[i*src_stride1+4]);\
 841         b= AV_RN32(&src2[i*src_stride2+4]);\
 842         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 843     }\
 844 }\
 845 \
 846 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 847                                                 int src_stride1, int src_stride2, int h){\
 848     int i;\
 849     for(i=0; i<h; i++){\
 850         uint32_t a,b;\
 851         a= AV_RN32(&src1[i*src_stride1  ]);\
 852         b= AV_RN32(&src2[i*src_stride2  ]);\
 853         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 854     }\
 855 }\
 856 \
 857 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 858                                                 int src_stride1, int src_stride2, int h){\
 859     int i;\
 860     for(i=0; i<h; i++){\
 861         uint32_t a,b;\
 862         a= AV_RN16(&src1[i*src_stride1  ]);\
 863         b= AV_RN16(&src2[i*src_stride2  ]);\
 864         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 865     }\
 866 }\
 867 \
 868 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 869                                                 int src_stride1, int src_stride2, int h){\
 870     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 871     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 872 }\
 873 \
 874 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 875                                                 int src_stride1, int src_stride2, int h){\
 876     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 877     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 878 }\
 879 \
 880 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 881     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 882 }\
 883 \
 884 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 885     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 886 }\
 887 \
 888 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 889     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 890 }\
 891 \
 892 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 893     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 894 }\
 895 \
 896 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 897                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 898     int i;\
 899     for(i=0; i<h; i++){\
 900         uint32_t a, b, c, d, l0, l1, h0, h1;\
 901         a= AV_RN32(&src1[i*src_stride1]);\
 902         b= AV_RN32(&src2[i*src_stride2]);\
 903         c= AV_RN32(&src3[i*src_stride3]);\
 904         d= AV_RN32(&src4[i*src_stride4]);\
 905         l0=  (a&0x03030303UL)\
 906            + (b&0x03030303UL)\
 907            + 0x02020202UL;\
 908         h0= ((a&0xFCFCFCFCUL)>>2)\
 909           + ((b&0xFCFCFCFCUL)>>2);\
 910         l1=  (c&0x03030303UL)\
 911            + (d&0x03030303UL);\
 912         h1= ((c&0xFCFCFCFCUL)>>2)\
 913           + ((d&0xFCFCFCFCUL)>>2);\
 914         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 915         a= AV_RN32(&src1[i*src_stride1+4]);\
 916         b= AV_RN32(&src2[i*src_stride2+4]);\
 917         c= AV_RN32(&src3[i*src_stride3+4]);\
 918         d= AV_RN32(&src4[i*src_stride4+4]);\
 919         l0=  (a&0x03030303UL)\
 920            + (b&0x03030303UL)\
 921            + 0x02020202UL;\
 922         h0= ((a&0xFCFCFCFCUL)>>2)\
 923           + ((b&0xFCFCFCFCUL)>>2);\
 924         l1=  (c&0x03030303UL)\
 925            + (d&0x03030303UL);\
 926         h1= ((c&0xFCFCFCFCUL)>>2)\
 927           + ((d&0xFCFCFCFCUL)>>2);\
 928         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 929     }\
 930 }\
 931 \
 932 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 933     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 934 }\
 935 \
 936 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 937     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 938 }\
 939 \
 940 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 941     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 942 }\
 943 \
 944 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 945     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 946 }\
 947 \
 948 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 949                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 950     int i;\
 951     for(i=0; i<h; i++){\
 952         uint32_t a, b, c, d, l0, l1, h0, h1;\
 953         a= AV_RN32(&src1[i*src_stride1]);\
 954         b= AV_RN32(&src2[i*src_stride2]);\
 955         c= AV_RN32(&src3[i*src_stride3]);\
 956         d= AV_RN32(&src4[i*src_stride4]);\
 957         l0=  (a&0x03030303UL)\
 958            + (b&0x03030303UL)\
 959            + 0x01010101UL;\
 960         h0= ((a&0xFCFCFCFCUL)>>2)\
 961           + ((b&0xFCFCFCFCUL)>>2);\
 962         l1=  (c&0x03030303UL)\
 963            + (d&0x03030303UL);\
 964         h1= ((c&0xFCFCFCFCUL)>>2)\
 965           + ((d&0xFCFCFCFCUL)>>2);\
 966         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 967         a= AV_RN32(&src1[i*src_stride1+4]);\
 968         b= AV_RN32(&src2[i*src_stride2+4]);\
 969         c= AV_RN32(&src3[i*src_stride3+4]);\
 970         d= AV_RN32(&src4[i*src_stride4+4]);\
 971         l0=  (a&0x03030303UL)\
 972            + (b&0x03030303UL)\
 973            + 0x01010101UL;\
 974         h0= ((a&0xFCFCFCFCUL)>>2)\
 975           + ((b&0xFCFCFCFCUL)>>2);\
 976         l1=  (c&0x03030303UL)\
 977            + (d&0x03030303UL);\
 978         h1= ((c&0xFCFCFCFCUL)>>2)\
 979           + ((d&0xFCFCFCFCUL)>>2);\
 980         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 981     }\
 982 }\
 983 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 984                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 985     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 986     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 987 }\
 988 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 989                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 990     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 991     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 992 }\
 993 \
 994 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 995 {\
 996         int i, a0, b0, a1, b1;\
 997         a0= pixels[0];\
 998         b0= pixels[1] + 2;\
 999         a0 += b0;\
1000         b0 += pixels[2];\
1001 \
1002         pixels+=line_size;\
1003         for(i=0; i<h; i+=2){\
1004             a1= pixels[0];\
1005             b1= pixels[1];\
1006             a1 += b1;\
1007             b1 += pixels[2];\
1008 \
1009             block[0]= (a1+a0)>>2; /* FIXME non put */\
1010             block[1]= (b1+b0)>>2;\
1011 \
1012             pixels+=line_size;\
1013             block +=line_size;\
1014 \
1015             a0= pixels[0];\
1016             b0= pixels[1] + 2;\
1017             a0 += b0;\
1018             b0 += pixels[2];\
1019 \
1020             block[0]= (a1+a0)>>2;\
1021             block[1]= (b1+b0)>>2;\
1022             pixels+=line_size;\
1023             block +=line_size;\
1024         }\
1025 }\
1026 \
1027 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1028 {\
1029         int i;\
1030         const uint32_t a= AV_RN32(pixels  );\
1031         const uint32_t b= AV_RN32(pixels+1);\
1032         uint32_t l0=  (a&0x03030303UL)\
1033                     + (b&0x03030303UL)\
1034                     + 0x02020202UL;\
1035         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1036                    + ((b&0xFCFCFCFCUL)>>2);\
1037         uint32_t l1,h1;\
1038 \
1039         pixels+=line_size;\
1040         for(i=0; i<h; i+=2){\
1041             uint32_t a= AV_RN32(pixels  );\
1042             uint32_t b= AV_RN32(pixels+1);\
1043             l1=  (a&0x03030303UL)\
1044                + (b&0x03030303UL);\
1045             h1= ((a&0xFCFCFCFCUL)>>2)\
1046               + ((b&0xFCFCFCFCUL)>>2);\
1047             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1048             pixels+=line_size;\
1049             block +=line_size;\
1050             a= AV_RN32(pixels  );\
1051             b= AV_RN32(pixels+1);\
1052             l0=  (a&0x03030303UL)\
1053                + (b&0x03030303UL)\
1054                + 0x02020202UL;\
1055             h0= ((a&0xFCFCFCFCUL)>>2)\
1056               + ((b&0xFCFCFCFCUL)>>2);\
1057             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1058             pixels+=line_size;\
1059             block +=line_size;\
1060         }\
1061 }\
1062 \
1063 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1064 {\
1065     int j;\
1066     for(j=0; j<2; j++){\
1067         int i;\
1068         const uint32_t a= AV_RN32(pixels  );\
1069         const uint32_t b= AV_RN32(pixels+1);\
1070         uint32_t l0=  (a&0x03030303UL)\
1071                     + (b&0x03030303UL)\
1072                     + 0x02020202UL;\
1073         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1074                    + ((b&0xFCFCFCFCUL)>>2);\
1075         uint32_t l1,h1;\
1076 \
1077         pixels+=line_size;\
1078         for(i=0; i<h; i+=2){\
1079             uint32_t a= AV_RN32(pixels  );\
1080             uint32_t b= AV_RN32(pixels+1);\
1081             l1=  (a&0x03030303UL)\
1082                + (b&0x03030303UL);\
1083             h1= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086             pixels+=line_size;\
1087             block +=line_size;\
1088             a= AV_RN32(pixels  );\
1089             b= AV_RN32(pixels+1);\
1090             l0=  (a&0x03030303UL)\
1091                + (b&0x03030303UL)\
1092                + 0x02020202UL;\
1093             h0= ((a&0xFCFCFCFCUL)>>2)\
1094               + ((b&0xFCFCFCFCUL)>>2);\
1095             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1096             pixels+=line_size;\
1097             block +=line_size;\
1098         }\
1099         pixels+=4-line_size*(h+1);\
1100         block +=4-line_size*h;\
1101     }\
1102 }\
1103 \
1104 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1105 {\
1106     int j;\
1107     for(j=0; j<2; j++){\
1108         int i;\
1109         const uint32_t a= AV_RN32(pixels  );\
1110         const uint32_t b= AV_RN32(pixels+1);\
1111         uint32_t l0=  (a&0x03030303UL)\
1112                     + (b&0x03030303UL)\
1113                     + 0x01010101UL;\
1114         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1115                    + ((b&0xFCFCFCFCUL)>>2);\
1116         uint32_t l1,h1;\
1117 \
1118         pixels+=line_size;\
1119         for(i=0; i<h; i+=2){\
1120             uint32_t a= AV_RN32(pixels  );\
1121             uint32_t b= AV_RN32(pixels+1);\
1122             l1=  (a&0x03030303UL)\
1123                + (b&0x03030303UL);\
1124             h1= ((a&0xFCFCFCFCUL)>>2)\
1125               + ((b&0xFCFCFCFCUL)>>2);\
1126             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1127             pixels+=line_size;\
1128             block +=line_size;\
1129             a= AV_RN32(pixels  );\
1130             b= AV_RN32(pixels+1);\
1131             l0=  (a&0x03030303UL)\
1132                + (b&0x03030303UL)\
1133                + 0x01010101UL;\
1134             h0= ((a&0xFCFCFCFCUL)>>2)\
1135               + ((b&0xFCFCFCFCUL)>>2);\
1136             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1137             pixels+=line_size;\
1138             block +=line_size;\
1139         }\
1140         pixels+=4-line_size*(h+1);\
1141         block +=4-line_size*h;\
1142     }\
1143 }\
1144 \
1145 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1146 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1147 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1148 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1149 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1150 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1151 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1152 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1153
1154 #define op_avg(a, b) a = rnd_avg32(a, b)
1155 #endif
1156 #define op_put(a, b) a = b
1157
1158 PIXOP2(avg, op_avg)
1159 PIXOP2(put, op_put)
1160 #undef op_avg
1161 #undef op_put
1162
/* Mean of two values, rounded up: (a + b + 1) >> 1.
 * Each argument is parenthesised so that arguments containing operators of
 * lower precedence than '+' (e.g. '|', '?:') expand as intended. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Mean of four values, rounded to nearest: (a + b + c + d + 2) >> 2.
 * Each argument is parenthesised so that arguments containing operators of
 * lower precedence than '+' (e.g. '|', '?:') expand as intended. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1165
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the same stride is
 * passed for the destination and for both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                     int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1169
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the same stride is
 * passed for the destination and for both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                    int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1173
/**
 * Bilinear interpolation of an 8-pixel-wide block.
 *
 * Each output pixel is a weighted sum of the 2x2 source neighbourhood that
 * starts at the same position. The four weights are products of x16,
 * (16 - x16), y16 and (16 - y16) and therefore add up to 256; the sum plus
 * rounder is shifted right by 8.
 *
 * @param dst     destination; 8 bytes are written per row
 * @param src     source; 9 bytes are read from the current and the next row
 * @param stride  byte distance between rows, used for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand samples
 * @param y16     vertical weight of the lower samples
 * @param rounder constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left     = (16 - x16) * (16 - y16);
    const int w_top_right    =       x16  * (16 - y16);
    const int w_bottom_left  = (16 - x16) *       y16;
    const int w_bottom_right =       x16  *       y16;
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left     * src[col]
                      + w_top_right    * src[col + 1]
                      + w_bottom_left  * src[stride + col]
                      + w_bottom_right * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1196
/**
 * Motion compensation of an 8-pixel-wide block along a per-pixel varying
 * (affine) sampling position.
 *
 * For output pixel (x, y) the sampling position starts at (ox, oy), advances
 * by (dxx, dyx) for every pixel in a row and by (dxy, dyy) for every row.
 * The low 16 bits of the position are dropped, the next `shift` bits form the
 * sub-pel fraction and the remaining bits the integer source coordinate.
 *
 * Inside the picture the pixel is bilinearly interpolated. If a coordinate
 * falls outside [0, dimension - 2] it is clamped to [0, dimension - 1] and
 * interpolation is only performed along the other axis; if both coordinates
 * are outside, the clamped source pixel is copied unchanged.
 *
 * @param dst     destination; written at dst[y*stride + x], x in 0..7
 * @param src     source plane; read at src[src_x + src_y*stride]
 * @param stride  byte distance between rows, used for both dst and src
 * @param h       number of rows to produce
 * @param shift   number of sub-pel fraction bits
 * @param r       rounding constant added before the shift by 2*shift
 * @param width   source width used for range checks and clamping
 * @param height  source height used for range checks and clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* unsigned compare rejects negative coordinates as well */
            const int x_inside = (unsigned)src_x < (unsigned)max_x;
            const int y_inside = (unsigned)src_y < (unsigned)max_y;
            const int out      = row * stride + col;
            int index;

            if (x_inside && y_inside) {
                index = src_x + src_y * stride;
                dst[out] = (  (  src[index             ] * (s - frac_x)
                               + src[index          + 1] *      frac_x ) * (s - frac_y)
                            + (  src[index + stride    ] * (s - frac_x)
                               + src[index + stride + 1] *      frac_x ) *      frac_y
                            + r) >> (shift * 2);
            } else if (x_inside) {
                /* vertically outside: horizontal interpolation on the clamped row */
                index = src_x + av_clip(src_y, 0, max_y) * stride;
                dst[out] = (  (  src[index    ] * (s - frac_x)
                               + src[index + 1] *      frac_x ) * s
                            + r) >> (shift * 2);
            } else if (y_inside) {
                /* horizontally outside: vertical interpolation on the clamped column */
                index = av_clip(src_x, 0, max_x) + src_y * stride;
                dst[out] = (  (  src[index         ] * (s - frac_y)
                               + src[index + stride] *      frac_y ) * s
                            + r) >> (shift * 2);
            } else {
                index = av_clip(src_x, 0, max_x) + av_clip(src_y, 0, max_y) * stride;
                dst[out] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1254
/**
 * Plain block copy for the (0,0) position: forwards to the put_pixelsN_c
 * routine matching width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1263
/**
 * Fill a width x height block with (2*src[x] + src[x+1]) / 3.
 *
 * The division is approximated as 683*(sum + 1) >> 11 (683/2048 is slightly
 * more than 1/3). One byte to the right of each row is read.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = 2 * in[x] + in[x + 1] + 1;
            out[x] = (683 * sum) >> 11;
        }
    }
}
1274
/**
 * Fill a width x height block with (src[x] + 2*src[x+1]) / 3.
 *
 * The division is approximated as 683*(sum + 1) >> 11 (683/2048 is slightly
 * more than 1/3). One byte to the right of each row is read.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = in[x] + 2 * in[x + 1] + 1;
            out[x] = (683 * sum) >> 11;
        }
    }
}
1285
/**
 * Fill a width x height block with (2*src[x] + src[x+stride]) / 3.
 *
 * The division is approximated as 683*(sum + 1) >> 11 (683/2048 is slightly
 * more than 1/3). One source row below the block is read.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = 2 * in[x] + in[x + stride] + 1;
            out[x] = (683 * sum) >> 11;
        }
    }
}
1296
/**
 * Fill a width x height block with a 2x2 weighted mean using weights
 * 4 (current), 3 (right), 3 (below) and 2 (below right); the weights sum
 * to 12.
 *
 * The division by 12 is approximated as 2731*(sum + 6) >> 15 (2731/32768 is
 * slightly more than 1/12). One extra column and one extra row are read.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = 4 * in[x]          + 3 * in[x + 1]
                          + 3 * in[x + stride] + 2 * in[x + stride + 1] + 6;
            out[x] = (2731 * sum) >> 15;
        }
    }
}
1307
/**
 * Fill a width x height block with a 2x2 weighted mean using weights
 * 3 (current), 2 (right), 4 (below) and 3 (below right); the weights sum
 * to 12.
 *
 * The division by 12 is approximated as 2731*(sum + 6) >> 15 (2731/32768 is
 * slightly more than 1/12). One extra column and one extra row are read.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = 3 * in[x]          + 2 * in[x + 1]
                          + 4 * in[x + stride] + 3 * in[x + stride + 1] + 6;
            out[x] = (2731 * sum) >> 15;
        }
    }
}
1318
/**
 * Fill a width x height block with (src[x] + 2*src[x+stride]) / 3.
 *
 * The division is approximated as 683*(sum + 1) >> 11 (683/2048 is slightly
 * more than 1/3). One source row below the block is read.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = in[x] + 2 * in[x + stride] + 1;
            out[x] = (683 * sum) >> 11;
        }
    }
}
1329
/**
 * Fill a width x height block with a 2x2 weighted mean using weights
 * 3 (current), 4 (right), 2 (below) and 3 (below right); the weights sum
 * to 12.
 *
 * The division by 12 is approximated as 2731*(sum + 6) >> 15 (2731/32768 is
 * slightly more than 1/12). One extra column and one extra row are read.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = 3 * in[x]          + 4 * in[x + 1]
                          + 2 * in[x + stride] + 3 * in[x + stride + 1] + 6;
            out[x] = (2731 * sum) >> 15;
        }
    }
}
1340
/**
 * Fill a width x height block with a 2x2 weighted mean using weights
 * 2 (current), 3 (right), 3 (below) and 4 (below right); the weights sum
 * to 12.
 *
 * The division by 12 is approximated as 2731*(sum + 6) >> 15 (2731/32768 is
 * slightly more than 1/12). One extra column and one extra row are read.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum = 2 * in[x]          + 3 * in[x + 1]
                          + 3 * in[x + stride] + 4 * in[x + stride + 1] + 6;
            out[x] = (2731 * sum) >> 15;
        }
    }
}
1351
/**
 * Averaging variant for the (0,0) position: forwards to the avg_pixelsN_c
 * routine matching width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1360
/**
 * Average the existing destination with (2*src[x] + src[x+1]) / 3.
 *
 * The interpolated value is 683*(sum + 1) >> 11 (683/2048 is slightly more
 * than 1/3); it is then combined with dst as (dst + value + 1) >> 1.
 * One byte to the right of each row is read.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = 2 * in[x] + in[x + 1] + 1;
            const int interp = (683 * sum) >> 11;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1371
/**
 * Average the existing destination with (src[x] + 2*src[x+1]) / 3.
 *
 * The interpolated value is 683*(sum + 1) >> 11 (683/2048 is slightly more
 * than 1/3); it is then combined with dst as (dst + value + 1) >> 1.
 * One byte to the right of each row is read.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = in[x] + 2 * in[x + 1] + 1;
            const int interp = (683 * sum) >> 11;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1382
/**
 * Average the existing destination with (2*src[x] + src[x+stride]) / 3.
 *
 * The interpolated value is 683*(sum + 1) >> 11 (683/2048 is slightly more
 * than 1/3); it is then combined with dst as (dst + value + 1) >> 1.
 * One source row below the block is read.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = 2 * in[x] + in[x + stride] + 1;
            const int interp = (683 * sum) >> 11;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1393
/**
 * Average the existing destination with a 2x2 weighted mean using weights
 * 4 (current), 3 (right), 3 (below) and 2 (below right); the weights sum
 * to 12.
 *
 * The interpolated value is 2731*(sum + 6) >> 15 (2731/32768 is slightly
 * more than 1/12); it is then combined with dst as (dst + value + 1) >> 1.
 * One extra column and one extra row are read.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = 4 * in[x]          + 3 * in[x + 1]
                             + 3 * in[x + stride] + 2 * in[x + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1404
/**
 * Average the existing destination with a 2x2 weighted mean using weights
 * 3 (current), 2 (right), 4 (below) and 3 (below right); the weights sum
 * to 12.
 *
 * The interpolated value is 2731*(sum + 6) >> 15 (2731/32768 is slightly
 * more than 1/12); it is then combined with dst as (dst + value + 1) >> 1.
 * One extra column and one extra row are read.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = 3 * in[x]          + 2 * in[x + 1]
                             + 4 * in[x + stride] + 3 * in[x + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1415
/**
 * Average the existing destination with (src[x] + 2*src[x+stride]) / 3.
 *
 * The interpolated value is 683*(sum + 1) >> 11 (683/2048 is slightly more
 * than 1/3); it is then combined with dst as (dst + value + 1) >> 1.
 * One source row below the block is read.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = in[x] + 2 * in[x + stride] + 1;
            const int interp = (683 * sum) >> 11;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1426
/**
 * Average the existing destination with a 2x2 weighted mean using weights
 * 3 (current), 4 (right), 2 (below) and 3 (below right); the weights sum
 * to 12.
 *
 * The interpolated value is 2731*(sum + 6) >> 15 (2731/32768 is slightly
 * more than 1/12); it is then combined with dst as (dst + value + 1) >> 1.
 * One extra column and one extra row are read.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = 3 * in[x]          + 4 * in[x + 1]
                             + 2 * in[x + stride] + 3 * in[x + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
1437
/**
 * Average the existing destination with a 2x2 weighted mean using weights
 * 2 (current), 3 (right), 3 (below) and 4 (below right); the weights sum
 * to 12.
 *
 * The interpolated value is 2731*(sum + 6) >> 15 (2731/32768 is slightly
 * more than 1/12); it is then combined with dst as (dst + value + 1) >> 1.
 * One extra column and one extra row are read.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;

    for (y = 0; y < height; y++) {
        const uint8_t *in  = src + y * stride;
        uint8_t       *out = dst + y * stride;
        int x;

        for (x = 0; x < width; x++) {
            const int sum    = 2 * in[x]          + 3 * in[x + 1]
                             + 3 * in[x + stride] + 4 * in[x + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            out[x] = (out[x] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width wrappers around the generic
 * put_tpel_pixels_mcXY_c() routines: TPEL_WIDTH(n) defines
 * put_tpel_pixels<n>_mcXY_c(dst, src, stride, height) for the nine
 * positions, each forwarding with width = n.
 * The forwarding statements are plain calls; a leading "void" would turn
 * them into declarations and the macro would not compile once enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1469
/*
 * H264_CHROMA_MC_FUNC(OPNAME, OP, W) defines
 *     static void <OPNAME>h264_chroma_mc<W>_c(dst, src, stride, h, x, y)
 * which bilinearly interpolates a block W pixels wide and h rows high.
 *
 * x and y must be in 0..7 (asserted). The four weights A..D are products of
 * x, (8-x), y and (8-y) and add up to 64; OP receives the destination lvalue
 * and the unscaled weighted sum and is responsible for rounding/scaling.
 *
 * When D is zero (x == 0 or y == 0) at most one of B and C is non-zero, so
 * only two taps are needed: the current sample and the one at offset `step`
 * (the next row if C is non-zero, otherwise the next column).
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* Defines the 2-, 4- and 8-pixel-wide chroma interpolators for one OP. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* b is the weighted sum scaled by 64: round to nearest and scale down;
 * the avg form additionally takes the rounded-up mean with the old value. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1578
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding bias.
 *
 * The four weights are products of x, (8-x), y and (8-y) and add up to 64.
 * The weighted sum is biased by 32 - 4 (instead of 32) before the shift by
 * 6, so results round down slightly more often than with plain rounding.
 *
 * @param dst    destination; 8 bytes are written per row
 * @param src    source; 9 bytes are read from the current and the next row
 * @param stride byte distance between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fraction, 0..7 (asserted)
 * @param y      vertical fraction, 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_cur         = (8 - x) * (8 - y);
    const int w_right       = (    x) * (8 - y);
    const int w_below       = (8 - x) * (    y);
    const int w_below_right = (    x) * (    y);
    const int bias          = 32 - 4;
    int row;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur         * src[col]
                      + w_right       * src[col + 1]
                      + w_below       * src[stride + col]
                      + w_below_right * src[stride + col + 1]
                      + bias) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1602
/**
 * QPEL_MC(r, OPNAME, RND, OP): template that expands into one complete family
 * of MPEG-4 style qpel motion-compensation functions for 8x8 and 16x16 blocks.
 *
 * Macro parameters:
 *   r      - not referenced anywhere in the macro body.
 *   OPNAME - prefix pasted onto every generated function name and onto the
 *            pixels8 / pixels16 helpers that perform the final store into dst
 *            (the instantiations below pass put_, put_no_rnd_ and avg_).
 *   RND    - infix pasted between "put" and a helper name; it selects which
 *            "put" helper family fills the intermediate buffers
 *            (the instantiations below pass _ or _no_rnd_).
 *   OP     - store operator invoked as OP(dst_lvalue, raw_filter_sum). It is
 *            expanded where a local `cm` (ff_cropTbl + MAX_NEG_CROP) is in scope.
 *
 * Low-pass filters (OPNAME mpeg4_qpel{8,16}_{h,v}_lowpass):
 *   Each output sample is (a+b)*20 - (c+d)*6 + (e+f)*3 - (g+h), i.e. the
 *   symmetric 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1). Near both ends of
 *   a row/column the taps that would fall outside the input are mirrored
 *   back inside, so for a block of width N (8 or 16) only samples 0..N are
 *   read. The raw, unscaled sum is passed to OP.
 *   The h variants take a row count `h`; the v variants always process a
 *   fixed N columns.
 *
 * Entry points (OPNAME qpel{8,16}_mcXY_c and ff_ OPNAME qpel{8,16}_mcXY_old_c):
 *   X = 0: no horizontal filtering; X = 2: horizontal low-pass only;
 *   X = 1 / 3: horizontal low-pass combined with the source at column +0 / +1.
 *   Y plays the same role vertically (row +0 / +1 of the source copy).
 *   NOTE(review): reading X and Y as quarter-sample offsets is inferred from
 *   the function names and call patterns; confirm against the callers.
 *
 *   The ff_*_old_c variants have external linkage; they compute halfH, halfV
 *   and halfHV separately and combine them with pixels*_l4 (mc11/31/13/33)
 *   or pixels*_l2 (mc12/32). The static non-old variants instead fold the
 *   source copy into halfH with put RND pixels*_l2 before the vertical pass.
 *
 * Scratch buffers:
 *   full   - copy of the source made by copy_block9 / copy_block17, laid out
 *            with stride 16 x 9 rows (8x8) or stride 24 x 17 rows (16x16).
 *   halfH  - horizontal low-pass output, 8x9 = 72 or 16x17 = 272 bytes (one
 *            extra row so the vertical filter can run over it).
 *   halfV, halfHV, half - 8x8 = 64 or 16x16 = 256 bytes.
 *
 * copy_block9/17 and the pixels*_c / _l2 / _l4 helpers are defined outside
 * this macro; presumably they copy a block and average 2 / 4 sources -
 * TODO confirm against their definitions.
 */
1603 #define QPEL_MC(r, OPNAME, RND, OP) \
1604 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1605     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1606     int i;\
1607     for(i=0; i<h; i++)\
1608     {\
1609         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1610         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1611         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1612         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1613         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1614         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1615         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1616         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1617         dst+=dstStride;\
1618         src+=srcStride;\
1619     }\
1620 }\
1621 /* Vertical 8-wide filter: walks 8 columns, each output column built from rows 0..8 of src. */\
1622 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1623     const int w=8;\
1624     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1625     int i;\
1626     for(i=0; i<w; i++)\
1627     {\
1628         const int src0= src[0*srcStride];\
1629         const int src1= src[1*srcStride];\
1630         const int src2= src[2*srcStride];\
1631         const int src3= src[3*srcStride];\
1632         const int src4= src[4*srcStride];\
1633         const int src5= src[5*srcStride];\
1634         const int src6= src[6*srcStride];\
1635         const int src7= src[7*srcStride];\
1636         const int src8= src[8*srcStride];\
1637         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1638         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1639         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1640         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1641         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1642         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1643         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1644         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1645         dst++;\
1646         src++;\
1647     }\
1648 }\
1649 /* Horizontal 16-wide filter: reads src[0..16] of each row, for h rows. */\
1650 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1651     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1652     int i;\
1653     \
1654     for(i=0; i<h; i++)\
1655     {\
1656         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1657         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1658         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1659         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1660         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1661         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1662         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1663         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1664         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1665         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1666         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1667         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1668         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1669         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1670         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1671         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1672         dst+=dstStride;\
1673         src+=srcStride;\
1674     }\
1675 }\
1676 /* Vertical 16-wide filter: walks 16 columns, each output column built from rows 0..16 of src. */\
1677 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1678     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1679     int i;\
1680     const int w=16;\
1681     for(i=0; i<w; i++)\
1682     {\
1683         const int src0= src[0*srcStride];\
1684         const int src1= src[1*srcStride];\
1685         const int src2= src[2*srcStride];\
1686         const int src3= src[3*srcStride];\
1687         const int src4= src[4*srcStride];\
1688         const int src5= src[5*srcStride];\
1689         const int src6= src[6*srcStride];\
1690         const int src7= src[7*srcStride];\
1691         const int src8= src[8*srcStride];\
1692         const int src9= src[9*srcStride];\
1693         const int src10= src[10*srcStride];\
1694         const int src11= src[11*srcStride];\
1695         const int src12= src[12*srcStride];\
1696         const int src13= src[13*srcStride];\
1697         const int src14= src[14*srcStride];\
1698         const int src15= src[15*srcStride];\
1699         const int src16= src[16*srcStride];\
1700         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1701         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1702         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1703         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1704         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1705         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1706         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1707         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1708         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1709         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1710         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1711         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1712         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1713         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1714         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1715         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1716         dst++;\
1717         src++;\
1718     }\
1719 }\
1720 /* ---- 8x8 entry points. mc00: no filtering, forwards to OPNAME pixels8_c with 8 rows. ---- */\
1721 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1722     OPNAME ## pixels8_c(dst, src, stride, 8);\
1723 }\
1724 /* mc10: horizontal low-pass into half[], then pixels8_l2 over src and half. */\
1725 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1726     uint8_t half[64];\
1727     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1728     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1729 }\
1730 /* mc20: horizontal low-pass written straight to dst through OP. */\
1731 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1732     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1733 }\
1734 /* mc30: same as mc10 except that pixels8_l2 is given src+1 (next column). */\
1735 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1736     uint8_t half[64];\
1737     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1738     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1739 }\
1740 /* mc01: copy 9 rows into full[], vertical low-pass into half[], then pixels8_l2 over full and half. */\
1741 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1742     uint8_t full[16*9];\
1743     uint8_t half[64];\
1744     copy_block9(full, src, 16, stride, 9);\
1745     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1746     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1747 }\
1748 /* mc02: vertical low-pass of the copied block written straight to dst through OP. */\
1749 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1750     uint8_t full[16*9];\
1751     copy_block9(full, src, 16, stride, 9);\
1752     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1753 }\
1754 /* mc03: same as mc01 except that pixels8_l2 is given full+16 (next row of the copy). */\
1755 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1756     uint8_t full[16*9];\
1757     uint8_t half[64];\
1758     copy_block9(full, src, 16, stride, 9);\
1759     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1760     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1761 }\
1762 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static; passes full, halfH, halfV, halfHV to pixels8_l4 */\
1763     uint8_t full[16*9];\
1764     uint8_t halfH[72];\
1765     uint8_t halfV[64];\
1766     uint8_t halfHV[64];\
1767     copy_block9(full, src, 16, stride, 9);\
1768     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1769     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1770     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1772 }\
1773 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* folds full into halfH via put RND pixels8_l2, then uses only halfH and halfHV */\
1774     uint8_t full[16*9];\
1775     uint8_t halfH[72];\
1776     uint8_t halfHV[64];\
1777     copy_block9(full, src, 16, stride, 9);\
1778     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1779     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1780     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1781     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1782 }\
1783 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1784     uint8_t full[16*9];\
1785     uint8_t halfH[72];\
1786     uint8_t halfV[64];\
1787     uint8_t halfHV[64];\
1788     copy_block9(full, src, 16, stride, 9);\
1789     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1790     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1791     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1792     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1793 }\
1794 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1795     uint8_t full[16*9];\
1796     uint8_t halfH[72];\
1797     uint8_t halfHV[64];\
1798     copy_block9(full, src, 16, stride, 9);\
1799     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1800     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1801     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1802     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1803 }\
1804 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1805     uint8_t full[16*9];\
1806     uint8_t halfH[72];\
1807     uint8_t halfV[64];\
1808     uint8_t halfHV[64];\
1809     copy_block9(full, src, 16, stride, 9);\
1810     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1812     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1813     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1814 }\
1815 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1816     uint8_t full[16*9];\
1817     uint8_t halfH[72];\
1818     uint8_t halfHV[64];\
1819     copy_block9(full, src, 16, stride, 9);\
1820     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1821     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1822     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1823     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1824 }\
1825 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826     uint8_t full[16*9];\
1827     uint8_t halfH[72];\
1828     uint8_t halfV[64];\
1829     uint8_t halfHV[64];\
1830     copy_block9(full, src, 16, stride, 9);\
1831     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1832     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1833     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1835 }\
1836 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1837     uint8_t full[16*9];\
1838     uint8_t halfH[72];\
1839     uint8_t halfHV[64];\
1840     copy_block9(full, src, 16, stride, 9);\
1841     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1842     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1843     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1844     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1845 }\
1846 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1847     uint8_t halfH[72];\
1848     uint8_t halfHV[64];\
1849     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1850     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1852 }\
1853 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1854     uint8_t halfH[72];\
1855     uint8_t halfHV[64];\
1856     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1857     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1859 }\
1860 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861     uint8_t full[16*9];\
1862     uint8_t halfH[72];\
1863     uint8_t halfV[64];\
1864     uint8_t halfHV[64];\
1865     copy_block9(full, src, 16, stride, 9);\
1866     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1867     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1868     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1869     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1870 }\
1871 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1872     uint8_t full[16*9];\
1873     uint8_t halfH[72];\
1874     copy_block9(full, src, 16, stride, 9);\
1875     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1876     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1877     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1878 }\
1879 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1880     uint8_t full[16*9];\
1881     uint8_t halfH[72];\
1882     uint8_t halfV[64];\
1883     uint8_t halfHV[64];\
1884     copy_block9(full, src, 16, stride, 9);\
1885     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1886     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1887     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1888     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1889 }\
1890 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1891     uint8_t full[16*9];\
1892     uint8_t halfH[72];\
1893     copy_block9(full, src, 16, stride, 9);\
1894     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1895     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1896     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1897 }\
1898 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1899     uint8_t halfH[72];\
1900     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1901     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1902 }\
1903 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* 16x16 entry points start here; same structure as the 8x8 ones */\
1904     OPNAME ## pixels16_c(dst, src, stride, 16);\
1905 }\
1906 /* mc10: horizontal low-pass into half[], then pixels16_l2 over src and half. */\
1907 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1908     uint8_t half[256];\
1909     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1910     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1911 }\
1912 /* mc20: horizontal low-pass written straight to dst through OP. */\
1913 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1914     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1915 }\
1916 /* mc30: same as mc10 except that pixels16_l2 is given src+1 (next column). */\
1917 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1918     uint8_t half[256];\
1919     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1920     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1921 }\
1922 /* mc01: copy 17 rows into full[], vertical low-pass into half[], then pixels16_l2 over full and half. */\
1923 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1924     uint8_t full[24*17];\
1925     uint8_t half[256];\
1926     copy_block17(full, src, 24, stride, 17);\
1927     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1928     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1929 }\
1930 /* mc02: vertical low-pass of the copied block written straight to dst through OP. */\
1931 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1932     uint8_t full[24*17];\
1933     copy_block17(full, src, 24, stride, 17);\
1934     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1935 }\
1936 /* mc03: same as mc01 except that pixels16_l2 is given full+24 (next row of the copy). */\
1937 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t full[24*17];\
1939     uint8_t half[256];\
1940     copy_block17(full, src, 24, stride, 17);\
1941     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1942     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1943 }\
1944 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1945     uint8_t full[24*17];\
1946     uint8_t halfH[272];\
1947     uint8_t halfV[256];\
1948     uint8_t halfHV[256];\
1949     copy_block17(full, src, 24, stride, 17);\
1950     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1951     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1952     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1954 }\
1955 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1956     uint8_t full[24*17];\
1957     uint8_t halfH[272];\
1958     uint8_t halfHV[256];\
1959     copy_block17(full, src, 24, stride, 17);\
1960     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1961     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1962     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1963     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1964 }\
1965 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1966     uint8_t full[24*17];\
1967     uint8_t halfH[272];\
1968     uint8_t halfV[256];\
1969     uint8_t halfHV[256];\
1970     copy_block17(full, src, 24, stride, 17);\
1971     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1972     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1973     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1974     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1975 }\
1976 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1977     uint8_t full[24*17];\
1978     uint8_t halfH[272];\
1979     uint8_t halfHV[256];\
1980     copy_block17(full, src, 24, stride, 17);\
1981     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1982     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1983     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1984     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1985 }\
1986 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1987     uint8_t full[24*17];\
1988     uint8_t halfH[272];\
1989     uint8_t halfV[256];\
1990     uint8_t halfHV[256];\
1991     copy_block17(full, src, 24, stride, 17);\
1992     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1994     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1995     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1996 }\
1997 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1998     uint8_t full[24*17];\
1999     uint8_t halfH[272];\
2000     uint8_t halfHV[256];\
2001     copy_block17(full, src, 24, stride, 17);\
2002     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2003     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2004     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2005     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006 }\
2007 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008     uint8_t full[24*17];\
2009     uint8_t halfH[272];\
2010     uint8_t halfV[256];\
2011     uint8_t halfHV[256];\
2012     copy_block17(full, src, 24, stride, 17);\
2013     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2014     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2015     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2017 }\
2018 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2019     uint8_t full[24*17];\
2020     uint8_t halfH[272];\
2021     uint8_t halfHV[256];\
2022     copy_block17(full, src, 24, stride, 17);\
2023     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2024     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2025     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2026     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2027 }\
2028 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2029     uint8_t halfH[272];\
2030     uint8_t halfHV[256];\
2031     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2032     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2033     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2034 }\
2035 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2036     uint8_t halfH[272];\
2037     uint8_t halfHV[256];\
2038     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2039     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2040     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2041 }\
2042 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2043     uint8_t full[24*17];\
2044     uint8_t halfH[272];\
2045     uint8_t halfV[256];\
2046     uint8_t halfHV[256];\
2047     copy_block17(full, src, 24, stride, 17);\
2048     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2049     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2050     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2051     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2052 }\
2053 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2054     uint8_t full[24*17];\
2055     uint8_t halfH[272];\
2056     copy_block17(full, src, 24, stride, 17);\
2057     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2059     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2060 }\
2061 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2062     uint8_t full[24*17];\
2063     uint8_t halfH[272];\
2064     uint8_t halfV[256];\
2065     uint8_t halfHV[256];\
2066     copy_block17(full, src, 24, stride, 17);\
2067     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2068     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2069     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2070     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2071 }\
2072 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2073     uint8_t full[24*17];\
2074     uint8_t halfH[272];\
2075     copy_block17(full, src, 24, stride, 17);\
2076     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2077     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2078     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2079 }\
2080 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2081     uint8_t halfH[272];\
2082     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2083     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2084 }
2085
2086 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2087 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2088 #define op_put(a, b) a = cm[((b) + 16)>>5]
2089 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2090
2091 QPEL_MC(0, put_       , _       , op_put)
2092 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2093 QPEL_MC(0, avg_       , _       , op_avg)
2094 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2095 #undef op_avg
2096 #undef op_avg_no_rnd
2097 #undef op_put
2098 #undef op_put_no_rnd
2099
2100 #if 1
2101 #define H264_LOWPASS(OPNAME, OP, OP2) \
2102 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2103     const int h=2;\
2104     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2105     int i;\
2106     for(i=0; i<h; i++)\
2107     {\
2108         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2109         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2110         dst+=dstStride;\
2111         src+=srcStride;\
2112     }\
2113 }\
2114 \
2115 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2116     const int w=2;\
2117     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2118     int i;\
2119     for(i=0; i<w; i++)\
2120     {\
2121         const int srcB= src[-2*srcStride];\
2122         const int srcA= src[-1*srcStride];\
2123         const int src0= src[0 *srcStride];\
2124         const int src1= src[1 *srcStride];\
2125         const int src2= src[2 *srcStride];\
2126         const int src3= src[3 *srcStride];\
2127         const int src4= src[4 *srcStride];\
2128         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2129         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2130         dst++;\
2131         src++;\
2132     }\
2133 }\
2134 \
2135 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2136     const int h=2;\
2137     const int w=2;\
2138     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2139     int i;\
2140     src -= 2*srcStride;\
2141     for(i=0; i<h+5; i++)\
2142     {\
2143         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2144         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2145         tmp+=tmpStride;\
2146         src+=srcStride;\
2147     }\
2148     tmp -= tmpStride*(h+5-2);\
2149     for(i=0; i<w; i++)\
2150     {\
2151         const int tmpB= tmp[-2*tmpStride];\
2152         const int tmpA= tmp[-1*tmpStride];\
2153         const int tmp0= tmp[0 *tmpStride];\
2154         const int tmp1= tmp[1 *tmpStride];\
2155         const int tmp2= tmp[2 *tmpStride];\
2156         const int tmp3= tmp[3 *tmpStride];\
2157         const int tmp4= tmp[4 *tmpStride];\
2158         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2159         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2160         dst++;\
2161         tmp++;\
2162     }\
2163 }\
2164 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2165     const int h=4;\
2166     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2167     int i;\
2168     for(i=0; i<h; i++)\
2169     {\
2170         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2171         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2172         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2173         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2174         dst+=dstStride;\
2175         src+=srcStride;\
2176     }\
2177 }\
2178 \
2179 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2180     const int w=4;\
2181     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182     int i;\
2183     for(i=0; i<w; i++)\
2184     {\
2185         const int srcB= src[-2*srcStride];\
2186         const int srcA= src[-1*srcStride];\
2187         const int src0= src[0 *srcStride];\
2188         const int src1= src[1 *srcStride];\
2189         const int src2= src[2 *srcStride];\
2190         const int src3= src[3 *srcStride];\
2191         const int src4= src[4 *srcStride];\
2192         const int src5= src[5 *srcStride];\
2193         const int src6= src[6 *srcStride];\
2194         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2195         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2196         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2197         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2198         dst++;\
2199         src++;\
2200     }\
2201 }\
2202 \
2203 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2204     const int h=4;\
2205     const int w=4;\
2206     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207     int i;\
2208     src -= 2*srcStride;\
2209     for(i=0; i<h+5; i++)\
2210     {\
2211         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2212         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2213         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2214         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2215         tmp+=tmpStride;\
2216         src+=srcStride;\
2217     }\
2218     tmp -= tmpStride*(h+5-2);\
2219     for(i=0; i<w; i++)\
2220     {\
2221         const int tmpB= tmp[-2*tmpStride];\
2222         const int tmpA= tmp[-1*tmpStride];\
2223         const int tmp0= tmp[0 *tmpStride];\
2224         const int tmp1= tmp[1 *tmpStride];\
2225         const int tmp2= tmp[2 *tmpStride];\
2226         const int tmp3= tmp[3 *tmpStride];\
2227         const int tmp4= tmp[4 *tmpStride];\
2228         const int tmp5= tmp[5 *tmpStride];\
2229         const int tmp6= tmp[6 *tmpStride];\
2230         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2231         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2232         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2233         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2234         dst++;\
2235         tmp++;\
2236     }\
2237 }\
2238 \
2239 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2240     const int h=8;\
2241     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242     int i;\
2243     for(i=0; i<h; i++)\
2244     {\
2245         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2246         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2247         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2248         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2249         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2250         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2251         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2252         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2253         dst+=dstStride;\
2254         src+=srcStride;\
2255     }\
2256 }\
2257 \
2258 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2259     const int w=8;\
2260     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2261     int i;\
2262     for(i=0; i<w; i++)\
2263     {\
2264         const int srcB= src[-2*srcStride];\
2265         const int srcA= src[-1*srcStride];\
2266         const int src0= src[0 *srcStride];\
2267         const int src1= src[1 *srcStride];\
2268         const int src2= src[2 *srcStride];\
2269         const int src3= src[3 *srcStride];\
2270         const int src4= src[4 *srcStride];\
2271         const int src5= src[5 *srcStride];\
2272         const int src6= src[6 *srcStride];\
2273         const int src7= src[7 *srcStride];\
2274         const int src8= src[8 *srcStride];\
2275         const int src9= src[9 *srcStride];\
2276         const int src10=src[10*srcStride];\
2277         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2278         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2279         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2280         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2281         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2282         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2283         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2284         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2285         dst++;\
2286         src++;\
2287     }\
2288 }\
2289 \
2290 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2291     const int h=8;\
2292     const int w=8;\
2293     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2294     int i;\
2295     src -= 2*srcStride;\
2296     for(i=0; i<h+5; i++)\
2297     {\
2298         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2299         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2300         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2301         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2302         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2303         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2304         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2305         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2306         tmp+=tmpStride;\
2307         src+=srcStride;\
2308     }\
2309     tmp -= tmpStride*(h+5-2);\
2310     for(i=0; i<w; i++)\
2311     {\
2312         const int tmpB= tmp[-2*tmpStride];\
2313         const int tmpA= tmp[-1*tmpStride];\
2314         const int tmp0= tmp[0 *tmpStride];\
2315         const int tmp1= tmp[1 *tmpStride];\
2316         const int tmp2= tmp[2 *tmpStride];\
2317         const int tmp3= tmp[3 *tmpStride];\
2318         const int tmp4= tmp[4 *tmpStride];\
2319         const int tmp5= tmp[5 *tmpStride];\
2320         const int tmp6= tmp[6 *tmpStride];\
2321         const int tmp7= tmp[7 *tmpStride];\
2322         const int tmp8= tmp[8 *tmpStride];\
2323         const int tmp9= tmp[9 *tmpStride];\
2324         const int tmp10=tmp[10*tmpStride];\
2325         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2326         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2327         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2328         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2329         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2330         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2331         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2332         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2333         dst++;\
2334         tmp++;\
2335     }\
2336 }\
2337 \
2338 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2340     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2341     src += 8*srcStride;\
2342     dst += 8*dstStride;\
2343     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2344     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2345 }\
2346 \
2347 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2348     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2349     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2350     src += 8*srcStride;\
2351     dst += 8*dstStride;\
2352     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2353     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2354 }\
2355 \
2356 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2357     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2358     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2359     src += 8*srcStride;\
2360     dst += 8*dstStride;\
2361     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2362     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2363 }\
2364
/**
 * Instantiate the sixteen C motion-compensation functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c for a SIZE x SIZE block.
 *
 * Judging by which filters each variant applies, X looks like the horizontal
 * and Y the vertical quarter-sample offset (0..3) -- confirm against the code
 * that fills the function tables.
 *
 * Intermediate planes are always produced with the put_ lowpass variants;
 * OPNAME only selects how the final result is stored into dst.
 * The vertical filter reads 2 rows above and 3 rows below the block, hence the
 * SIZE x (SIZE+5) scratch buffers and the full_mid pointer that skips the
 * first two copied rows.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: no filtering, delegates to OPNAME##pixels##SIZE##_c */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10/mc30: horizontal lowpass plane combined (via _l2) with src resp. src+1 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal lowpass written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc01/mc03: vertical lowpass plane combined with the current row (full_mid) */\
/* resp. the next row (full_mid+SIZE) of the copied source block */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical lowpass written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11/mc31/mc13/mc33: combine a horizontal lowpass plane (taken at src or */\
/* one row below) with a vertical lowpass plane (taken at src or one column */\
/* to the right) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-pass (hv) lowpass written straight to dst; tmp holds the */\
/* 16-bit intermediate rows */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* mc21/mc23: hv plane combined with a horizontal lowpass plane taken at src */\
/* resp. one row below */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12/mc32: hv plane combined with a vertical lowpass plane taken at src */\
/* resp. one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2501
/* Store operators handed to H264_LOWPASS.
 * op_*  normalise a single-pass filter sum: add 16, shift right by 5.
 * op2_* normalise a two-pass (hv) filter sum: add 512, shift right by 10.
 * The shifted value indexes cm, which every lowpass function sets to
 * ff_cropTbl + MAX_NEG_CROP (presumably a clip-to-8-bit lookup table --
 * confirm against the table initialisation).
 * The avg variants additionally average that result with the value already
 * stored in a, rounding up. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Generate the put_ and avg_ lowpass filters, then the mcXY functions.
 * Note that the avg_ variants are not instantiated for SIZE 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* the store operators are only meaningful inside the macros above */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2522 #endif
2523
2524 #define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2525 #define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2526 #define H264_WEIGHT(W,H) \
2527 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2528     int y; \
2529     offset <<= log2_denom; \
2530     if(log2_denom) offset += 1<<(log2_denom-1); \
2531     for(y=0; y<H; y++, block += stride){ \
2532         op_scale1(0); \
2533         op_scale1(1); \
2534         if(W==2) continue; \
2535         op_scale1(2); \
2536         op_scale1(3); \
2537         if(W==4) continue; \
2538         op_scale1(4); \
2539         op_scale1(5); \
2540         op_scale1(6); \
2541         op_scale1(7); \
2542         if(W==8) continue; \
2543         op_scale1(8); \
2544         op_scale1(9); \
2545         op_scale1(10); \
2546         op_scale1(11); \
2547         op_scale1(12); \
2548         op_scale1(13); \
2549         op_scale1(14); \
2550         op_scale1(15); \
2551     } \
2552 } \
2553 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2554     int y; \
2555     offset = ((offset + 1) | 1) << log2_denom; \
2556     for(y=0; y<H; y++, dst += stride, src += stride){ \
2557         op_scale2(0); \
2558         op_scale2(1); \
2559         if(W==2) continue; \
2560         op_scale2(2); \
2561         op_scale2(3); \
2562         if(W==4) continue; \
2563         op_scale2(4); \
2564         op_scale2(5); \
2565         op_scale2(6); \
2566         op_scale2(7); \
2567         if(W==8) continue; \
2568         op_scale2(8); \
2569         op_scale2(9); \
2570         op_scale2(10); \
2571         op_scale2(11); \
2572         op_scale2(12); \
2573         op_scale2(13); \
2574         op_scale2(14); \
2575         op_scale2(15); \
2576     } \
2577 }
2578
2579 H264_WEIGHT(16,16)
2580 H264_WEIGHT(16,8)
2581 H264_WEIGHT(8,16)
2582 H264_WEIGHT(8,8)
2583 H264_WEIGHT(8,4)
2584 H264_WEIGHT(4,8)
2585 H264_WEIGHT(4,4)
2586 H264_WEIGHT(4,2)
2587 H264_WEIGHT(2,4)
2588 H264_WEIGHT(2,2)
2589
2590 #undef op_scale1
2591 #undef op_scale2
2592 #undef H264_WEIGHT
2593
2594 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2595     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2596     int i;
2597
2598     for(i=0; i<h; i++){
2599         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2600         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2601         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2602         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2603         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2604         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2605         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2606         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2607         dst+=dstStride;
2608         src+=srcStride;
2609     }
2610 }
2611
2612 #ifdef CONFIG_CAVS_DECODER
2613 /* AVS specific */
2614 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2615
/* CAVS 8x8 mc00 "put": forwards to put_pixels8_c, passing 8 as the last
 * argument (presumably the row count -- confirm against put_pixels8_c). */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/* CAVS 8x8 mc00 "avg": forwards to avg_pixels8_c, passing 8 as the last
 * argument (presumably the row count -- confirm against avg_pixels8_c). */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/* CAVS 16x16 mc00 "put": forwards to put_pixels16_c, passing 16 as the last
 * argument (presumably the row count -- confirm against put_pixels16_c). */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/* CAVS 16x16 mc00 "avg": forwards to avg_pixels16_c, passing 16 as the last
 * argument (presumably the row count -- confirm against avg_pixels16_c). */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
2628 #endif /* CONFIG_CAVS_DECODER */
2629
2630 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2631 /* VC-1 specific */
2632 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2633
/* VC-1 mspel mc00 "put": forwards to put_pixels8_c with 8 as the last
 * argument. The rnd parameter is accepted but not used here. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
2637 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2638
2639 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2640
2641 /* H264 specific */
2642 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2643
2644 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2645     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2646     int i;
2647
2648     for(i=0; i<w; i++){
2649         const int src_1= src[ -srcStride];
2650         const int src0 = src[0          ];
2651         const int src1 = src[  srcStride];
2652         const int src2 = src[2*srcStride];
2653         const int src3 = src[3*srcStride];
2654         const int src4 = src[4*srcStride];
2655         const int src5 = src[5*srcStride];
2656         const int src6 = src[6*srcStride];
2657         const int src7 = src[7*srcStride];
2658         const int src8 = src[8*srcStride];
2659         const int src9 = src[9*srcStride];
2660         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2661         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2662         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2663         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2664         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2665         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2666         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2667         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2668         src++;
2669         dst++;
2670     }
2671 }
2672
/* mspel 8x8 mc00: no filtering, forwards to put_pixels8_c with 8 as the
 * last argument. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2676
2677 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2678     uint8_t half[64];
2679     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2680     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2681 }
2682
/* mspel 8x8 mc20: horizontal lowpass of 8 rows written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2686
2687 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2688     uint8_t half[64];
2689     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2690     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2691 }
2692
/* mspel 8x8 mc02: vertical lowpass of 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2696
2697 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2698     uint8_t halfH[88];
2699     uint8_t halfV[64];
2700     uint8_t halfHV[64];
2701     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2702     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2703     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2704     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2705 }
2706 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2707     uint8_t halfH[88];
2708     uint8_t halfV[64];
2709     uint8_t halfHV[64];
2710     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2711     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2712     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2713     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2714 }
2715 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2716     uint8_t halfH[88];
2717     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2718     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2719 }
2720
2721 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2722     if(ENABLE_ANY_H263) {
2723     int x;
2724     const int strength= ff_h263_loop_filter_strength[qscale];
2725
2726     for(x=0; x<8; x++){
2727         int d1, d2, ad1;
2728         int p0= src[x-2*stride];
2729         int p1= src[x-1*stride];
2730         int p2= src[x+0*stride];
2731         int p3= src[x+1*stride];
2732         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2733
2734         if     (d<-2*strength) d1= 0;
2735         else if(d<-  strength) d1=-2*strength - d;
2736         else if(d<   strength) d1= d;
2737         else if(d< 2*strength) d1= 2*strength - d;
2738         else                   d1= 0;
2739
2740         p1 += d1;
2741         p2 -= d1;
2742         if(p1&256) p1= ~(p1>>31);
2743         if(p2&256) p2= ~(p2>>31);
2744
2745         src[x-1*stride] = p1;
2746         src[x+0*stride] = p2;
2747
2748         ad1= FFABS(d1)>>1;
2749
2750         d2= av_clip((p0-p3)/4, -ad1, ad1);
2751
2752         src[x-2*stride] = p0 - d2;
2753         src[x+  stride] = p3 + d2;
2754     }
2755     }
2756 }
2757
2758 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2759     if(ENABLE_ANY_H263) {
2760     int y;
2761     const int strength= ff_h263_loop_filter_strength[qscale];
2762
2763     for(y=0; y<8; y++){
2764         int d1, d2, ad1;
2765         int p0= src[y*stride-2];
2766         int p1= src[y*stride-1];
2767         int p2= src[y*stride+0];
2768         int p3= src[y*stride+1];
2769         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2770
2771         if     (d<-2*strength) d1= 0;
2772         else if(d<-  strength) d1=-2*strength - d;
2773         else if(d<   strength) d1= d;
2774         else if(d< 2*strength) d1= 2*strength - d;
2775         else                   d1= 0;
2776
2777         p1 += d1;
2778         p2 -= d1;
2779         if(p1&256) p1= ~(p1>>31);
2780         if(p2&256) p2= ~(p2>>31);
2781
2782         src[y*stride-1] = p1;
2783         src[y*stride+0] = p2;
2784
2785         ad1= FFABS(d1)>>1;
2786
2787         d2= av_clip((p0-p3)/4, -ad1, ad1);
2788
2789         src[y*stride-2] = p0 - d2;
2790         src[y*stride+1] = p3 + d2;
2791     }
2792     }
2793 }
2794
/**
 * H.261 loop filter on an 8x8 block, in place.
 * A separable (1, 2, 1) filter is applied vertically, then horizontally.
 * Samples on the block border are not filtered in the direction that would
 * leave the block: the top/bottom rows skip the vertical pass and the
 * left/right columns skip the horizontal pass.
 * @param src    top-left sample of the block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];  /* vertically filtered samples, scaled by 4 */
    int row, col;

    /* vertical pass: reads the whole block before anything is written back */
    for(row=0; row<8; row++){
        const uint8_t *cur= src + row*stride;
        for(col=0; col<8; col++){
            if(row==0 || row==7)
                vert[row][col]= 4*cur[col];
            else
                vert[row][col]= cur[col - stride] + 2*cur[col] + cur[col + stride];
        }
    }

    /* horizontal pass with rounding; border columns only undo the x4 scale */
    for(row=0; row<8; row++){
        uint8_t *out= src + row*stride;
        const int *v= vert[row];

        out[0]= (v[0] + 2)>>2;
        out[7]= (v[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (v[col-1] + 2*v[col] + v[col+1] + 8)>>4;
    }
}
2821
2822 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2823 {
2824     int i, d;
2825     for( i = 0; i < 4; i++ ) {
2826         if( tc0[i] < 0 ) {
2827             pix += 4*ystride;
2828             continue;
2829         }
2830         for( d = 0; d < 4; d++ ) {
2831             const int p0 = pix[-1*xstride];
2832             const int p1 = pix[-2*xstride];
2833             const int p2 = pix[-3*xstride];
2834             const int q0 = pix[0];
2835             const int q1 = pix[1*xstride];
2836             const int q2 = pix[2*xstride];
2837
2838             if( FFABS( p0 - q0 ) < alpha &&
2839                 FFABS( p1 - p0 ) < beta &&
2840                 FFABS( q1 - q0 ) < beta ) {
2841
2842                 int tc = tc0[i];
2843                 int i_delta;
2844
2845                 if( FFABS( p2 - p0 ) < beta ) {
2846                     pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2847                     tc++;
2848                 }
2849                 if( FFABS( q2 - q0 ) < beta ) {
2850                     pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2851                     tc++;
2852                 }
2853
2854                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2855                 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
2856                 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
2857             }
2858             pix += ystride;
2859         }
2860     }
2861 }
/* Luma filter with taps `stride` apart and a step of 1 along the edge,
 * i.e. it filters across a horizontal edge. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Luma filter with taps 1 apart and a step of `stride` along the edge,
 * i.e. it filters across a vertical edge. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2870
2871 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2872 {
2873     int i, d;
2874     for( i = 0; i < 4; i++ ) {
2875         const int tc = tc0[i];
2876         if( tc <= 0 ) {
2877             pix += 2*ystride;
2878             continue;
2879         }
2880         for( d = 0; d < 2; d++ ) {
2881             const int p0 = pix[-1*xstride];
2882             const int p1 = pix[-2*xstride];
2883             const int q0 = pix[0];
2884             const int q1 = pix[1*xstride];
2885
2886             if( FFABS( p0 - q0 ) < alpha &&
2887                 FFABS( p1 - p0 ) < beta &&
2888                 FFABS( q1 - q0 ) < beta ) {
2889
2890                 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2891
2892                 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
2893                 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
2894             }
2895             pix += ystride;
2896         }
2897     }
2898 }
/* Chroma filter with taps `stride` apart and a step of 1 along the edge,
 * i.e. it filters across a horizontal edge. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma filter with taps 1 apart and a step of `stride` along the edge,
 * i.e. it filters across a vertical edge. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2907
2908 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2909 {
2910     int d;
2911     for( d = 0; d < 8; d++ ) {
2912         const int p0 = pix[-1*xstride];
2913         const int p1 = pix[-2*xstride];
2914         const int q0 = pix[0];
2915         const int q1 = pix[1*xstride];
2916
2917         if( FFABS( p0 - q0 ) < alpha &&
2918             FFABS( p1 - p0 ) < beta &&
2919             FFABS( q1 - q0 ) < beta ) {
2920
2921             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
2922             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
2923         }
2924         pix += ystride;
2925     }
2926 }
/* Intra chroma filter with taps `stride` apart and a step of 1 along the
 * edge, i.e. it filters across a horizontal edge. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra chroma filter with taps 1 apart and a step of `stride` along the
 * edge, i.e. it filters across a vertical edge. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2935
/**
 * Sum of absolute differences between two 16 sample wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2963
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is interpolated between each sample and its right-hand neighbour
 * (avg2 of pix2[x] and pix2[x+1]). Reads 17 samples per reference row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2991
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is interpolated between each row and the row below it
 * (avg2 of the sample and the one line_size bytes further). Reads h+1
 * reference rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3021
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is interpolated from each 2x2 neighbourhood (avg4 of the sample, its right
 * neighbour, and the two samples directly below those). Reads 17 samples
 * from each of h+1 reference rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3051
/**
 * Sum of absolute differences between two blocks that are 8 samples wide
 * and h rows high.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3071
/**
 * Sum of absolute differences between an 8-wide block and a reference that
 * is interpolated between each sample and its right-hand neighbour
 * (avg2 of pix2[x] and pix2[x+1]). Reads 9 samples per reference row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3091
/**
 * Sum of absolute differences between an 8-wide block and a reference that
 * is interpolated between each row and the row below it
 * (avg2 of the sample and the one line_size bytes further). Reads h+1
 * reference rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3113
/**
 * Sum of absolute differences between an 8-wide block and a reference that
 * is interpolated from each 2x2 neighbourhood (avg4 of the sample, its right
 * neighbour, and the two samples directly below those). Reads 9 samples
 * from each of h+1 reference rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3135
3136 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3137     MpegEncContext *c = v;
3138     int score1=0;
3139     int score2=0;
3140     int x,y;
3141
3142     for(y=0; y<h; y++){
3143         for(x=0; x<16; x++){
3144             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3145         }
3146         if(y+1<h){
3147             for(x=0; x<15; x++){
3148                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3149                              - s1[x+1] + s1[x+1+stride])
3150                         -FFABS(  s2[x  ] - s2[x  +stride]
3151                              - s2[x+1] + s2[x+1+stride]);
3152             }
3153         }
3154         s1+= stride;
3155         s2+= stride;
3156     }
3157
3158     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3159     else  return score1 + FFABS(score2)*8;
3160 }
3161
3162 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3163     MpegEncContext *c = v;
3164     int score1=0;
3165     int score2=0;
3166     int x,y;
3167
3168     for(y=0; y<h; y++){
3169         for(x=0; x<8; x++){
3170             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3171         }
3172         if(y+1<h){
3173             for(x=0; x<7; x++){
3174                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3175                              - s1[x+1] + s1[x+1+stride])
3176                         -FFABS(  s2[x  ] - s2[x  +stride]
3177                              - s2[x+1] + s2[x+1+stride]);
3178             }
3179         }
3180         s1+= stride;
3181         s2+= stride;
3182     }
3183
3184     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3185     else  return score1 + FFABS(score2)*8;
3186 }
3187
3188 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3189     int i;
3190     unsigned int sum=0;
3191
3192     for(i=0; i<8*8; i++){
3193         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3194         int w= weight[i];
3195         b>>= RECON_SHIFT;
3196         assert(-512<b && b<512);
3197
3198         sum += (w*b)*(w*b)>>4;
3199     }
3200     return sum>>2;
3201 }
3202
3203 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3204     int i;
3205
3206     for(i=0; i<8*8; i++){
3207         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3208     }
3209 }
3210
3211 /**
3212  * permutes an 8x8 block.
3213  * @param block the block which will be permuted according to the given permutation vector
3214  * @param permutation the permutation vector
3215  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3216  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3217  *                  (inverse) permutated to scantable order!
3218  */
3219 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3220 {
3221     int i;
3222     DCTELEM temp[64];
3223
3224     if(last<=0) return;
3225     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3226
3227     for(i=0; i<=last; i++){
3228         const int j= scantable[i];
3229         temp[j]= block[j];
3230         block[j]=0;
3231     }
3232
3233     for(i=0; i<=last; i++){
3234         const int j= scantable[i];
3235         const int perm_j= permutation[j];
3236         block[perm_j]= temp[j];
3237     }
3238 }
3239
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int score = 0;

    return score;
}
3243
3244 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3245     int i;
3246
3247     memset(cmp, 0, sizeof(void*)*5);
3248
3249     for(i=0; i<5; i++){
3250         switch(type&0xFF){
3251         case FF_CMP_SAD:
3252             cmp[i]= c->sad[i];
3253             break;
3254         case FF_CMP_SATD:
3255             cmp[i]= c->hadamard8_diff[i];
3256             break;
3257         case FF_CMP_SSE:
3258             cmp[i]= c->sse[i];
3259             break;
3260         case FF_CMP_DCT:
3261             cmp[i]= c->dct_sad[i];
3262             break;
3263         case FF_CMP_DCT264:
3264             cmp[i]= c->dct264_sad[i];
3265             break;
3266         case FF_CMP_DCTMAX:
3267             cmp[i]= c->dct_max[i];
3268             break;
3269         case FF_CMP_PSNR:
3270             cmp[i]= c->quant_psnr[i];
3271             break;
3272         case FF_CMP_BIT:
3273             cmp[i]= c->bit[i];
3274             break;
3275         case FF_CMP_RD:
3276             cmp[i]= c->rd[i];
3277             break;
3278         case FF_CMP_VSAD:
3279             cmp[i]= c->vsad[i];
3280             break;
3281         case FF_CMP_VSSE:
3282             cmp[i]= c->vsse[i];
3283             break;
3284         case FF_CMP_ZERO:
3285             cmp[i]= zero_cmp;
3286             break;
3287         case FF_CMP_NSSE:
3288             cmp[i]= c->nsse[i];
3289             break;
3290 #ifdef CONFIG_SNOW_ENCODER
3291         case FF_CMP_W53:
3292             cmp[i]= c->w53[i];
3293             break;
3294         case FF_CMP_W97:
3295             cmp[i]= c->w97[i];
3296             break;
3297 #endif
3298         default:
3299             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3300         }
3301     }
3302 }
3303
3304 /**
3305  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3306  */
3307 static void clear_blocks_c(DCTELEM *blocks)
3308 {
3309     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3310 }
3311
3312 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3313     long i;
3314     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3315         long a = *(long*)(src+i);
3316         long b = *(long*)(dst+i);
3317         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3318     }
3319     for(; i<w; i++)
3320         dst[i+0] += src[i+0];
3321 }
3322
3323 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3324     long i;
3325     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3326         long a = *(long*)(src1+i);
3327         long b = *(long*)(src2+i);
3328         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3329     }
3330     for(; i<w; i++)
3331         dst[i] = src1[i]+src2[i];
3332 }
3333
3334 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3335     long i;
3336 #ifndef HAVE_FAST_UNALIGNED
3337     if((long)src2 & (sizeof(long)-1)){
3338         for(i=0; i+7<w; i+=8){
3339             dst[i+0] = src1[i+0]-src2[i+0];
3340             dst[i+1] = src1[i+1]-src2[i+1];
3341             dst[i+2] = src1[i+2]-src2[i+2];
3342             dst[i+3] = src1[i+3]-src2[i+3];
3343             dst[i+4] = src1[i+4]-src2[i+4];
3344             dst[i+5] = src1[i+5]-src2[i+5];
3345             dst[i+6] = src1[i+6]-src2[i+6];
3346             dst[i+7] = src1[i+7]-src2[i+7];
3347         }
3348     }else
3349 #endif
3350     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3351         long a = *(long*)(src1+i);
3352         long b = *(long*)(src2+i);
3353         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3354     }
3355     for(; i<w; i++)
3356         dst[i+0] = src1[i+0]-src2[i+0];
3357 }
3358
/**
 * Writes the residual of a median predictor over one line.
 *
 * For every position the prediction is mid_pred() of the previous src2
 * sample, the src1 sample at this position and
 * (previous src2 + src1 - previous src1) & 0xFF; dst receives the src2
 * sample minus that prediction.
 *
 * @param dst      residual output, w bytes
 * @param src1     line supplying the second predictor input
 * @param src2     line being predicted
 * @param w        number of samples
 * @param left     in: sample preceding src2[0]; out: last src2 sample read
 * @param left_top in: sample preceding src1[0]; out: last src1 sample read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int pred = mid_pred(prev, src1[x], (prev + src1[x] - prev_top)&0xFF);

        prev_top = src1[x];
        prev     = src2[x];
        dst[x]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_top;
}
3376
/*
 * Butterfly helpers. BUTTERFLY2 and BUTTERFLY1 expand to a single statement
 * (do { } while(0)), so they are safe inside unbraced if/else/for bodies and
 * must be followed by a semicolon.
 */

/* o1 = i1 + i2, o2 = i1 - i2; i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/* In-place butterfly: x becomes x + y and y becomes x - y (old values). */
#define BUTTERFLY1(x,y) \
do {\
    const int butterfly_in0_= (x);\
    const int butterfly_in1_= (y);\
    (x)= butterfly_in0_ + butterfly_in1_;\
    (y)= butterfly_in0_ - butterfly_in1_;\
} while(0)

/* |x + y| + |x - y|, evaluated as an expression. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3391
/**
 * Sum of absolute values of the 8x8 butterfly transform of (src - dst).
 *
 * The difference block is transformed row by row into temp, then column by
 * column; the last column stage is folded into the absolute-value sum.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between rows
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* horizontal pass over each row of the difference */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *from = src + stride*i;
        const uint8_t *sub  = dst + stride*i;

        BUTTERFLY2(row[0], row[1], from[0]-sub[0], from[1]-sub[1]);
        BUTTERFLY2(row[2], row[3], from[2]-sub[2], from[3]-sub[3]);
        BUTTERFLY2(row[4], row[5], from[4]-sub[4], from[5]-sub[5]);
        BUTTERFLY2(row[6], row[7], from[6]-sub[6], from[7]-sub[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass over each column */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
3443
/**
 * Sum of absolute values of the 8x8 butterfly transform of src itself,
 * with the magnitude of the first (mean) coefficient subtracted again.
 *
 * @param s      unused
 * @param src    source block
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* horizontal pass over each row */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *from = src + stride*i;

        BUTTERFLY2(row[0], row[1], from[0], from[1]);
        BUTTERFLY2(row[2], row[3], from[2], from[3]);
        BUTTERFLY2(row[4], row[5], from[4], from[5]);
        BUTTERFLY2(row[6], row[7], from[6], from[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass over each column */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return total;
}
3491
3492 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493     MpegEncContext * const s= (MpegEncContext *)c;
3494     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3495     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3496
3497     assert(h==8);
3498
3499     s->dsp.diff_pixels(temp, src1, src2, stride);
3500     s->dsp.fdct(temp);
3501     return s->dsp.sum_abs_dctelem(temp);
3502 }
3503
#ifdef CONFIG_GPL
/*
 * One 8-point 1-D integer transform pass.
 * The user must define SRC(x) to read input x and DST(x,v) to consume
 * output x with value v. All eight inputs are read before the first DST,
 * so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int d07 = SRC(0) - SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int d16 = SRC(1) - SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int d25 = SRC(2) - SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int d34 = SRC(3) - SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Applies DCT8_1D in both directions to the difference of two 8x8 blocks
 * and returns the sum of the absolute values of the resulting coefficients.
 *
 * @param c      MpegEncContext pointer
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between rows
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int sum = 0;
    int i;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform dct[i][0..7] in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform dct[0..7][i]; outputs are only accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST
    return sum;
}
#endif
3556
3557 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3558     MpegEncContext * const s= (MpegEncContext *)c;
3559     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3560     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3561     int sum=0, i;
3562
3563     assert(h==8);
3564
3565     s->dsp.diff_pixels(temp, src1, src2, stride);
3566     s->dsp.fdct(temp);
3567
3568     for(i=0; i<64; i++)
3569         sum= FFMAX(sum, FFABS(temp[i]));
3570
3571     return sum;
3572 }
3573
3574 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3575     MpegEncContext * const s= (MpegEncContext *)c;
3576     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3577     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3578     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3579     int sum=0, i;
3580
3581     assert(h==8);
3582     s->mb_intra=0;
3583
3584     s->dsp.diff_pixels(temp, src1, src2, stride);
3585
3586     memcpy(bak, temp, 64*sizeof(DCTELEM));
3587
3588     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3589     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3590     ff_simple_idct(temp); //FIXME
3591
3592     for(i=0; i<64; i++)
3593         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3594
3595     return sum;
3596 }
3597
3598 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3599     MpegEncContext * const s= (MpegEncContext *)c;
3600     const uint8_t *scantable= s->intra_scantable.permutated;
3601     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3602     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3603     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3604     uint8_t * const bak= (uint8_t*)aligned_bak;
3605     int i, last, run, bits, level, distoration, start_i;
3606     const int esc_length= s->ac_esc_length;
3607     uint8_t * length;
3608     uint8_t * last_length;
3609
3610     assert(h==8);
3611
3612     for(i=0; i<8; i++){
3613         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3614         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3615     }
3616
3617     s->dsp.diff_pixels(temp, src1, src2, stride);
3618
3619     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3620
3621     bits=0;
3622
3623     if (s->mb_intra) {
3624         start_i = 1;
3625         length     = s->intra_ac_vlc_length;
3626         last_length= s->intra_ac_vlc_last_length;
3627         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3628     } else {
3629         start_i = 0;
3630         length     = s->inter_ac_vlc_length;
3631         last_length= s->inter_ac_vlc_last_length;
3632     }
3633
3634     if(last>=start_i){
3635         run=0;
3636         for(i=start_i; i<last; i++){
3637             int j= scantable[i];
3638             level= temp[j];
3639
3640             if(level){
3641                 level+=64;
3642                 if((level&(~127)) == 0){
3643                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3644                 }else
3645                     bits+= esc_length;
3646                 run=0;
3647             }else
3648                 run++;
3649         }
3650         i= scantable[last];
3651
3652         level= temp[i] + 64;
3653
3654         assert(level - 64);
3655
3656         if((level&(~127)) == 0){
3657             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3658         }else
3659             bits+= esc_length;
3660
3661     }
3662
3663     if(last>=0){
3664         if(s->mb_intra)
3665             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3666         else
3667             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3668     }
3669
3670     s->dsp.idct_add(bak, stride, temp);
3671
3672     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3673
3674     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3675 }
3676
3677 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3678     MpegEncContext * const s= (MpegEncContext *)c;
3679     const uint8_t *scantable= s->intra_scantable.permutated;
3680     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3681     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3682     int i, last, run, bits, level, start_i;
3683     const int esc_length= s->ac_esc_length;
3684     uint8_t * length;
3685     uint8_t * last_length;
3686
3687     assert(h==8);
3688
3689     s->dsp.diff_pixels(temp, src1, src2, stride);
3690
3691     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3692
3693     bits=0;
3694
3695     if (s->mb_intra) {
3696         start_i = 1;
3697         length     = s->intra_ac_vlc_length;
3698         last_length= s->intra_ac_vlc_last_length;
3699         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3700     } else {
3701         start_i = 0;
3702         length     = s->inter_ac_vlc_length;
3703         last_length= s->inter_ac_vlc_last_length;
3704     }
3705
3706     if(last>=start_i){
3707         run=0;
3708         for(i=start_i; i<last; i++){
3709             int j= scantable[i];
3710             level= temp[j];
3711
3712             if(level){
3713                 level+=64;
3714                 if((level&(~127)) == 0){
3715                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3716                 }else
3717                     bits+= esc_length;
3718                 run=0;
3719             }else
3720                 run++;
3721         }
3722         i= scantable[last];
3723
3724         level= temp[i] + 64;
3725
3726         assert(level - 64);
3727
3728         if((level&(~127)) == 0){
3729             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3730         }else
3731             bits+= esc_length;
3732     }
3733
3734     return bits;
3735 }
3736
/**
 * Sum of absolute differences between vertically adjacent samples of a
 * 16-wide block: rows 0..h-2 are each compared with the row below.
 *
 * @param c      unused
 * @param s      source block
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++, s += stride) {
        for (col = 0; col < 16; col++)
            score += FFABS(s[col] - s[col + stride]);
    }

    return score;
}
3751
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-wide block: for rows 0..h-2, |(s1 - s2)[row] - (s1 - s2)[row + 1]|.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++)
            score += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
    }

    return score;
}
3766
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent samples of a
 * 16-wide block: rows 0..h-2 are each compared with the row below.
 *
 * @param c      unused
 * @param s      source block
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++, s += stride) {
        for (col = 0; col < 16; col++)
            score += SQ(s[col] - s[col + stride]);
    }

    return score;
}
3782
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-wide block: for rows 0..h-2, ((s1 - s2)[row] - (s1 - s2)[row + 1])^2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows
 * @param h      number of rows
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int grad = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            score += grad * grad;
        }
    }

    return score;
}
3797
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
    }
    return total;
}
3806
/*
 * Instantiate the *16_c counterparts of the 8x8 comparison functions above.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this section; presumably
 * it builds a function with the second name that applies the 8x8 function
 * given as first argument to the sub-blocks of a 16-wide block - confirm
 * against the macro definition.
 * dct264_sad16_c only exists when CONFIG_GPL is defined, matching the
 * guard around dct264_sad8x8_c.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3817
/**
 * Multiplies dst element-wise by src, in place: dst[i] *= src[i].
 *
 * @param dst vector updated in place
 * @param src multiplier vector
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] *= src[k];
        k++;
    }
}
3823
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len-1-i].
 *
 * @param dst  output vector
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const float *tail = src1 + len-1;
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * tail[-k];
}
3830
/**
 * Multiply-add with a strided destination:
 * dst[i*step] = src0[i] * src1[i] + src2[i] + src3.
 *
 * @param dst  output, written every step elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 vector addend
 * @param src3 integer constant added to every element
 * @param len  number of elements processed
 * @param step distance between written elements of dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k;

    for (k = 0; k < len; k++) {
        float *out = dst + k*step;

        *out = src0[k] * src1[k] + src2[k] + src3;
    }
}
3836
/**
 * Converts pre-biased floats to int16_t by inspecting their bit patterns.
 *
 * The low 16 bits of each float's 32-bit representation, minus 0x8000, are
 * taken as the sample. With IEEE-754 single precision this means 385.0f
 * (0x43c08000) maps to 0 and 384.0f (0x43c00000) maps to -32768. If any of
 * bits 16..19 is set the value is considered out of range and is saturated:
 * patterns above 0x43c0ffff give 32767, all others give -32768.
 *
 * @param dst output samples
 * @param src input floats (must already carry the bias described above)
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        /* Reinterpret through a union: reading a float object through an
         * int32_t pointer breaks the strict-aliasing rule. */
        union { float f; int32_t bits; } pun;
        int_fast32_t tmp;

        pun.f = src[i];
        tmp   = pun.bits;
        if(tmp & 0xf0000){
            /* all-ones when tmp > 0x43c0ffff, zero otherwise */
            tmp = (0x43c0ffff - tmp)>>31;
        }
        dst[i] = tmp - 0x8000;
    }
}
3850
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 inverse DCT: transforms the 8 consecutive
 * coefficients b[0..7] in place. Results are rounded and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    const int rnd = 1<<7;
    /* step 1: weighted input pairs */
    const int a1 = W1*b[1]+W7*b[7];
    const int a7 = W7*b[1]-W1*b[7];
    const int a5 = W5*b[5]+W3*b[3];
    const int a3 = W3*b[5]-W5*b[3];
    const int a2 = W2*b[2]+W6*b[6];
    const int a6 = W6*b[2]-W2*b[6];
    const int a0 = W0*b[0]+W0*b[4];
    const int a4 = W0*b[0]-W0*b[4];
    /* step 2: scaled odd combinations (inputs 1,3,5,7) */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: combine even and odd parts; outputs k and 7-k mirror each other */
    const int even0 = a0+a2, odd0 = a1+a5;
    const int even1 = a4+a6;
    const int even2 = a4-a6;
    const int even3 = a0-a2, odd3 = a7+a3;

    b[0] = (even0 + odd0 + rnd)>>8;
    b[7] = (even0 - odd0 + rnd)>>8;
    b[1] = (even1 + s1   + rnd)>>8;
    b[6] = (even1 - s1   + rnd)>>8;
    b[2] = (even2 + s2   + rnd)>>8;
    b[5] = (even2 - s2   + rnd)>>8;
    b[3] = (even3 + odd3 + rnd)>>8;
    b[4] = (even3 - odd3 + rnd)>>8;
}

/**
 * Column pass of the WMV2 inverse DCT: transforms the 8 coefficients
 * b[0], b[8], ..., b[56] in place. The first step already drops 3 bits
 * and the results are rounded and scaled down by a further 14 bits.
 */
static void wmv2_idct_col(short * b)
{
    const int rnd = 1<<13;
    /* step 1, with extended precision */
    const int a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    const int a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    const int a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    const int a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    const int a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    const int a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    const int a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    const int a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* step 2 */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3 */
    const int even0 = a0+a2, odd0 = a1+a5;
    const int even1 = a4+a6;
    const int even2 = a4-a6;
    const int even3 = a0-a2, odd3 = a7+a3;

    b[8*0] = (even0 + odd0 + rnd)>>14;
    b[8*7] = (even0 - odd0 + rnd)>>14;
    b[8*1] = (even1 + s1   + rnd)>>14;
    b[8*6] = (even1 - s1   + rnd)>>14;
    b[8*2] = (even2 + s2   + rnd)>>14;
    b[8*5] = (even2 - s2   + rnd)>>14;
    b[8*3] = (even3 + odd3 + rnd)>>14;
    b[8*4] = (even3 - odd3 + rnd)>>14;
}

/**
 * In-place WMV2 inverse DCT of an 8x8 block stored as 64 consecutive
 * shorts: all 8 rows are transformed first, then all 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
/* XXX: these wrapper functions should be removed as soon as all IDCTs are
 converted */
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the result to
 * put_pixels_clamped_c() to be stored at dest.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to put_pixels_clamped_c()
 * @param block      coefficient block; overwritten by the IDCT
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the result to
 * add_pixels_clamped_c() to be combined with the pixels at dest.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to add_pixels_clamped_c()
 * @param block      coefficient block; overwritten by the IDCT
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block, then hand the block to put_pixels_clamped_c()
 * to be stored at dest.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to put_pixels_clamped_c()
 * @param block      coefficient block passed to j_rev_dct()
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block, then hand the block to add_pixels_clamped_c()
 * to be combined with the pixels at dest.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to add_pixels_clamped_c()
 * @param block      coefficient block passed to j_rev_dct()
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3945
/**
 * Run j_rev_dct4() on block, then hand the block to put_pixels_clamped4_c()
 * to be stored at dest. Installed by dsputil_init() for lowres==1.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to put_pixels_clamped4_c()
 * @param block      coefficient block passed to j_rev_dct4()
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run j_rev_dct4() on block, then hand the block to add_pixels_clamped4_c()
 * to be combined with the pixels at dest. Installed by dsputil_init() for
 * lowres==1.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to add_pixels_clamped4_c()
 * @param block      coefficient block passed to j_rev_dct4()
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3956
/**
 * Run j_rev_dct2() on block, then hand the block to put_pixels_clamped2_c()
 * to be stored at dest. Installed by dsputil_init() for lowres==2.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to put_pixels_clamped2_c()
 * @param block      coefficient block passed to j_rev_dct2()
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run j_rev_dct2() on block, then hand the block to add_pixels_clamped2_c()
 * to be combined with the pixels at dest. Installed by dsputil_init() for
 * lowres==2.
 *
 * @param dest       destination pixels
 * @param line_size  forwarded unchanged to add_pixels_clamped2_c()
 * @param block      coefficient block passed to j_rev_dct2()
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3967
3968 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3969 {
3970     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3971
3972     dest[0] = cm[(block[0] + 4)>>3];
3973 }
3974 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3975 {
3976     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3977
3978     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3979 }
3980
3981 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3982
3983 /* init static data */
3984 void dsputil_static_init(void)
3985 {
3986     int i;
3987
3988     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3989     for(i=0;i<MAX_NEG_CROP;i++) {
3990         ff_cropTbl[i] = 0;
3991         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3992     }
3993
3994     for(i=0;i<512;i++) {
3995         ff_squareTbl[i] = (i - 256) * (i - 256);
3996     }
3997
3998     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3999 }
4000
4001 int ff_check_alignment(void){
4002     static int did_fail=0;
4003     DECLARE_ALIGNED_16(int, aligned);
4004
4005     if((long)&aligned & 15){
4006         if(!did_fail){
4007 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4008             av_log(NULL, AV_LOG_ERROR,
4009                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4010                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4011                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4012                 "Do not report crashes to FFmpeg developers.\n");
4013 #endif
4014             did_fail=1;
4015         }
4016         return -1;
4017     }
4018     return 0;
4019 }
4020
/**
 * Fill a DSPContext with function pointers.
 *
 * The portable C implementations are installed first. Afterwards the
 * architecture specific dsputil_init_*() hooks are called on the same
 * context, then empty 2-tap qpel slots are filled from the h264 qpel tables,
 * and finally the IDCT coefficient permutation is generated from
 * c->idct_permutation_type.
 *
 * @param c      context to fill
 * @param avctx  codec context; dct_algo, lowres and idct_algo are read here,
 *               and it is passed on to the sub-initialisers and to av_log()
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* the return value is not checked here */
    ff_check_alignment();

    /* forward DCT selection, compiled in for encoder builds only */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres 1, 2 and 3 use the j_rev_dct4/2/1
     * variants, any other lowres value selects by idct_algo.
     * Every branch also sets idct_permutation_type, which the switch at the
     * end of this function turns into c->idct_permutation. */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        /* c->idct is j_rev_dct4 for both put/add choices above */
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            /* the only C IDCT here that asks for a permuted coefficient order */
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* H.264 transforms, set only when the H.264 decoder is enabled */
    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    /* basic pixel helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* pix_abs[size][variant]: first index 0 holds the 16 functions, 1 the 8
     * functions; second index is plain, x2, y2, xy2 */
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* dspfunc(PFX, IDX, NUM) fills c->PFX_pixels_tab[IDX][0..3] with the
     * plain, _x2, _y2 and _xy2 variants of PFX_pixelsNUM */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: index is x + 4*y of the mcXY function; slots 3, 7 and 11
     * are not assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc redefined for the qpel tables: fills all 16 entries of
     * c->PFX_pixels_tab[IDX] with PFXNUM_mcXY_c, entry index = x + 4*y */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    /* note: put_h264_qpel gets sizes 16, 8, 4 and 2, avg_h264_qpel only
     * 16, 8 and 4, so avg_h264_qpel_pixels_tab[3] is not assigned here */
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    /* H.264 weighted prediction, one entry per block size from 16x16 to 2x2 */
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    c->draw_edges = draw_edges_c;

    /* codec specific sub-initialisers, compiled in per configuration */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

    /* mspel table: entries 0..3 are mc00..mc30, entries 4..7 are mc02..mc32 */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) sets c->name[0] to name16_c and c->name[1] to
     * name8x8_c. The macro body already ends in ';', hence no ';' at the
     * call sites. It is not #undef'd within this function. */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    /* index 4 of these tables receives the *_intra16 variants */
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#ifdef CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    /* loop filters */
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    /* no C implementation is installed for this one */
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

    /* audio helpers */
#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#ifdef CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    /* default prefetch is a no-op */
    c->prefetch= just_return;

    /* clear the 2-tap qpel tables so that slots left empty by the
     * architecture hooks below can be detected and filled afterwards */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture specific hooks; they run after the C defaults above and
     * receive the same context (presumably to replace entries with optimised
     * versions -- see the individual dsputil_init_*() functions) */
    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    /* any 2-tap qpel slot that is still NULL falls back to the h264 qpel
     * function at the same position.
     * NOTE(review): [0][i] with i up to 63 walks past the first row; this
     * relies on both tables being contiguous arrays with 64 entries in total
     * -- confirm against the DSPContext declaration. */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the 64-entry coefficient permutation for the IDCT that ended up
     * selected (the type may have been changed by an architecture hook) */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* row bits kept; column bits rotated: bit 0 moves to bit 2,
         * bits 1..2 move down to bits 0..1 */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        /* table driven */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swap the 3 row bits with the 3 column bits */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        /* bits 2 and 5 kept; the low two column bits are exchanged with the
         * low two row bits */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    default:
        /* c->idct_permutation is left untouched in this case */
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4389