git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file libavcodec/dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "snow.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39
  40 /* snow.c */
  41 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  42
  43 /* vorbis.c */
  44 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  45
  46 /* ac3dec.c */
  47 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
  48
  49 /* lpc.c */
  50 void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  51
  52 /* pngdec.c */
  53 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  54
  55 /* eaidct.c */
  56 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  57
  58 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  59 uint32_t ff_squareTbl[512] = {0, };
  60
  61 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  62 #define pb_7f (~0UL/255 * 0x7f)
  63 #define pb_80 (~0UL/255 * 0x80)
  64
  65 const uint8_t ff_zigzag_direct[64] = {
  66     0,   1,  8, 16,  9,  2,  3, 10,
  67     17, 24, 32, 25, 18, 11,  4,  5,
  68     12, 19, 26, 33, 40, 48, 41, 34,
  69     27, 20, 13,  6,  7, 14, 21, 28,
  70     35, 42, 49, 56, 57, 50, 43, 36,
  71     29, 22, 15, 23, 30, 37, 44, 51,
  72     58, 59, 52, 45, 38, 31, 39, 46,
  73     53, 60, 61, 54, 47, 55, 62, 63
  74 };
  75
  76 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  77    specification, we interleave the fields */
  78 const uint8_t ff_zigzag248_direct[64] = {
  79      0,  8,  1,  9, 16, 24,  2, 10,
  80     17, 25, 32, 40, 48, 56, 33, 41,
  81     18, 26,  3, 11,  4, 12, 19, 27,
  82     34, 42, 49, 57, 50, 58, 35, 43,
  83     20, 28,  5, 13,  6, 14, 21, 29,
  84     36, 44, 51, 59, 52, 60, 37, 45,
  85     22, 30,  7, 15, 23, 31, 38, 46,
  86     53, 61, 54, 62, 39, 47, 55, 63,
  87 };
  88
/* Non-permuted inverse of zigzag_direct, plus 1; used by the MMX quantizer. */
  90 DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64];
  91
  92 const uint8_t ff_alternate_horizontal_scan[64] = {
  93     0,  1,   2,  3,  8,  9, 16, 17,
  94     10, 11,  4,  5,  6,  7, 15, 14,
  95     13, 12, 19, 18, 24, 25, 32, 33,
  96     26, 27, 20, 21, 22, 23, 28, 29,
  97     30, 31, 34, 35, 40, 41, 48, 49,
  98     42, 43, 36, 37, 38, 39, 44, 45,
  99     46, 47, 50, 51, 56, 57, 58, 59,
 100     52, 53, 54, 55, 60, 61, 62, 63,
 101 };
 102
 103 const uint8_t ff_alternate_vertical_scan[64] = {
 104     0,  8,  16, 24,  1,  9,  2, 10,
 105     17, 25, 32, 40, 48, 56, 57, 49,
 106     41, 33, 26, 18,  3, 11,  4, 12,
 107     19, 27, 34, 42, 50, 58, 35, 43,
 108     51, 59, 20, 28,  5, 13,  6, 14,
 109     21, 29, 36, 44, 52, 60, 37, 45,
 110     53, 61, 22, 30,  7, 15, 23, 31,
 111     38, 46, 54, 62, 39, 47, 55, 63,
 112 };
 113
 114 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 115  * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
 116 const uint32_t ff_inverse[257]={
 117          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 118  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 119  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 120  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 121  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 122  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 123   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 124   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 125   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 126   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 127   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 128   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 129   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 130   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 131   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 132   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 133   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 134   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 135   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 136   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 137   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 138   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 139   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 140   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 141   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 142   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 143   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 144   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 145   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 146   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 147   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 148   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 149   16777216
 150 };
 151
 152 /* Input permutation for the simple_idct_mmx */
 153 static const uint8_t simple_mmx_permutation[64]={
 154         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 155         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 156         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 157         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 158         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 159         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 160         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 161         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 162 };
 163
 164 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 165
 166 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 167     int i;
 168     int end;
 169
 170     st->scantable= src_scantable;
 171
 172     for(i=0; i<64; i++){
 173         int j;
 174         j = src_scantable[i];
 175         st->permutated[i] = permutation[j];
 176 #if ARCH_PPC
 177         st->inverse[j] = i;
 178 #endif
 179     }
 180
 181     end=-1;
 182     for(i=0; i<64; i++){
 183         int j;
 184         j = st->permutated[i];
 185         if(j>end) end=j;
 186         st->raster_end[i]= end;
 187     }
 188 }
 189
 190 static int pix_sum_c(uint8_t * pix, int line_size)
 191 {
 192     int s, i, j;
 193
 194     s = 0;
 195     for (i = 0; i < 16; i++) {
 196         for (j = 0; j < 16; j += 8) {
 197             s += pix[0];
 198             s += pix[1];
 199             s += pix[2];
 200             s += pix[3];
 201             s += pix[4];
 202             s += pix[5];
 203             s += pix[6];
 204             s += pix[7];
 205             pix += 8;
 206         }
 207         pix += line_size - 16;
 208     }
 209     return s;
 210 }
 211
 212 static int pix_norm1_c(uint8_t * pix, int line_size)
 213 {
 214     int s, i, j;
 215     uint32_t *sq = ff_squareTbl + 256;
 216
 217     s = 0;
 218     for (i = 0; i < 16; i++) {
 219         for (j = 0; j < 16; j += 8) {
 220 #if 0
 221             s += sq[pix[0]];
 222             s += sq[pix[1]];
 223             s += sq[pix[2]];
 224             s += sq[pix[3]];
 225             s += sq[pix[4]];
 226             s += sq[pix[5]];
 227             s += sq[pix[6]];
 228             s += sq[pix[7]];
 229 #else
 230 #if LONG_MAX > 2147483647
 231             register uint64_t x=*(uint64_t*)pix;
 232             s += sq[x&0xff];
 233             s += sq[(x>>8)&0xff];
 234             s += sq[(x>>16)&0xff];
 235             s += sq[(x>>24)&0xff];
 236             s += sq[(x>>32)&0xff];
 237             s += sq[(x>>40)&0xff];
 238             s += sq[(x>>48)&0xff];
 239             s += sq[(x>>56)&0xff];
 240 #else
 241             register uint32_t x=*(uint32_t*)pix;
 242             s += sq[x&0xff];
 243             s += sq[(x>>8)&0xff];
 244             s += sq[(x>>16)&0xff];
 245             s += sq[(x>>24)&0xff];
 246             x=*(uint32_t*)(pix+4);
 247             s += sq[x&0xff];
 248             s += sq[(x>>8)&0xff];
 249             s += sq[(x>>16)&0xff];
 250             s += sq[(x>>24)&0xff];
 251 #endif
 252 #endif
 253             pix += 8;
 254         }
 255         pix += line_size - 16;
 256     }
 257     return s;
 258 }
 259
 260 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 261     int i;
 262
 263     for(i=0; i+8<=w; i+=8){
 264         dst[i+0]= bswap_32(src[i+0]);
 265         dst[i+1]= bswap_32(src[i+1]);
 266         dst[i+2]= bswap_32(src[i+2]);
 267         dst[i+3]= bswap_32(src[i+3]);
 268         dst[i+4]= bswap_32(src[i+4]);
 269         dst[i+5]= bswap_32(src[i+5]);
 270         dst[i+6]= bswap_32(src[i+6]);
 271         dst[i+7]= bswap_32(src[i+7]);
 272     }
 273     for(;i<w; i++){
 274         dst[i+0]= bswap_32(src[i+0]);
 275     }
 276 }
 277
 278 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 279 {
 280     int s, i;
 281     uint32_t *sq = ff_squareTbl + 256;
 282
 283     s = 0;
 284     for (i = 0; i < h; i++) {
 285         s += sq[pix1[0] - pix2[0]];
 286         s += sq[pix1[1] - pix2[1]];
 287         s += sq[pix1[2] - pix2[2]];
 288         s += sq[pix1[3] - pix2[3]];
 289         pix1 += line_size;
 290         pix2 += line_size;
 291     }
 292     return s;
 293 }
 294
 295 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 296 {
 297     int s, i;
 298     uint32_t *sq = ff_squareTbl + 256;
 299
 300     s = 0;
 301     for (i = 0; i < h; i++) {
 302         s += sq[pix1[0] - pix2[0]];
 303         s += sq[pix1[1] - pix2[1]];
 304         s += sq[pix1[2] - pix2[2]];
 305         s += sq[pix1[3] - pix2[3]];
 306         s += sq[pix1[4] - pix2[4]];
 307         s += sq[pix1[5] - pix2[5]];
 308         s += sq[pix1[6] - pix2[6]];
 309         s += sq[pix1[7] - pix2[7]];
 310         pix1 += line_size;
 311         pix2 += line_size;
 312     }
 313     return s;
 314 }
 315
 316 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 317 {
 318     int s, i;
 319     uint32_t *sq = ff_squareTbl + 256;
 320
 321     s = 0;
 322     for (i = 0; i < h; i++) {
 323         s += sq[pix1[ 0] - pix2[ 0]];
 324         s += sq[pix1[ 1] - pix2[ 1]];
 325         s += sq[pix1[ 2] - pix2[ 2]];
 326         s += sq[pix1[ 3] - pix2[ 3]];
 327         s += sq[pix1[ 4] - pix2[ 4]];
 328         s += sq[pix1[ 5] - pix2[ 5]];
 329         s += sq[pix1[ 6] - pix2[ 6]];
 330         s += sq[pix1[ 7] - pix2[ 7]];
 331         s += sq[pix1[ 8] - pix2[ 8]];
 332         s += sq[pix1[ 9] - pix2[ 9]];
 333         s += sq[pix1[10] - pix2[10]];
 334         s += sq[pix1[11] - pix2[11]];
 335         s += sq[pix1[12] - pix2[12]];
 336         s += sq[pix1[13] - pix2[13]];
 337         s += sq[pix1[14] - pix2[14]];
 338         s += sq[pix1[15] - pix2[15]];
 339
 340         pix1 += line_size;
 341         pix2 += line_size;
 342     }
 343     return s;
 344 }
 345
 346
 347 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
 348 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 349     int s, i, j;
 350     const int dec_count= w==8 ? 3 : 4;
 351     int tmp[32*32];
 352     int level, ori;
 353     static const int scale[2][2][4][4]={
 354       {
 355         {
 356             // 9/7 8x8 dec=3
 357             {268, 239, 239, 213},
 358             {  0, 224, 224, 152},
 359             {  0, 135, 135, 110},
 360         },{
 361             // 9/7 16x16 or 32x32 dec=4
 362             {344, 310, 310, 280},
 363             {  0, 320, 320, 228},
 364             {  0, 175, 175, 136},
 365             {  0, 129, 129, 102},
 366         }
 367       },{
 368         {
 369             // 5/3 8x8 dec=3
 370             {275, 245, 245, 218},
 371             {  0, 230, 230, 156},
 372             {  0, 138, 138, 113},
 373         },{
 374             // 5/3 16x16 or 32x32 dec=4
 375             {352, 317, 317, 286},
 376             {  0, 328, 328, 233},
 377             {  0, 180, 180, 140},
 378             {  0, 132, 132, 105},
 379         }
 380       }
 381     };
 382
 383     for (i = 0; i < h; i++) {
 384         for (j = 0; j < w; j+=4) {
 385             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 386             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 387             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 388             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 389         }
 390         pix1 += line_size;
 391         pix2 += line_size;
 392     }
 393
 394     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 395
 396     s=0;
 397     assert(w==h);
 398     for(level=0; level<dec_count; level++){
 399         for(ori= level ? 1 : 0; ori<4; ori++){
 400             int size= w>>(dec_count-level);
 401             int sx= (ori&1) ? size : 0;
 402             int stride= 32<<(dec_count-level);
 403             int sy= (ori&2) ? stride>>1 : 0;
 404
 405             for(i=0; i<size; i++){
 406                 for(j=0; j<size; j++){
 407                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 408                     s += FFABS(v);
 409                 }
 410             }
 411         }
 412     }
 413     assert(s>=0);
 414     return s>>9;
 415 }
 416
 417 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 418     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 419 }
 420
 421 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 422     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 423 }
 424
 425 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 426     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 427 }
 428
 429 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 430     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 431 }
 432
 433 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 434     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 435 }
 436
 437 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 438     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 439 }
 440 #endif
 441
 442 /* draw the edges of width 'w' of an image of size width, height */
 443 //FIXME check that this is ok for mpeg4 interlaced
 444 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 445 {
 446     uint8_t *ptr, *last_line;
 447     int i;
 448
 449     last_line = buf + (height - 1) * wrap;
 450     for(i=0;i<w;i++) {
 451         /* top and bottom */
 452         memcpy(buf - (i + 1) * wrap, buf, width);
 453         memcpy(last_line + (i + 1) * wrap, last_line, width);
 454     }
 455     /* left and right */
 456     ptr = buf;
 457     for(i=0;i<height;i++) {
 458         memset(ptr - w, ptr[0], w);
 459         memset(ptr + width, ptr[width-1], w);
 460         ptr += wrap;
 461     }
 462     /* corners */
 463     for(i=0;i<w;i++) {
 464         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 465         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 466         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 467         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 468     }
 469 }
 470
 471 /**
 472  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 473  * @param buf destination buffer
 474  * @param src source buffer
 475  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 476  * @param block_w width of block
 477  * @param block_h height of block
 478  * @param src_x x coordinate of the top left sample of the block in the source buffer
 479  * @param src_y y coordinate of the top left sample of the block in the source buffer
 480  * @param w width of the source buffer
 481  * @param h height of the source buffer
 482  */
 483 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 484                                     int src_x, int src_y, int w, int h){
 485     int x, y;
 486     int start_y, start_x, end_y, end_x;
 487
 488     if(src_y>= h){
 489         src+= (h-1-src_y)*linesize;
 490         src_y=h-1;
 491     }else if(src_y<=-block_h){
 492         src+= (1-block_h-src_y)*linesize;
 493         src_y=1-block_h;
 494     }
 495     if(src_x>= w){
 496         src+= (w-1-src_x);
 497         src_x=w-1;
 498     }else if(src_x<=-block_w){
 499         src+= (1-block_w-src_x);
 500         src_x=1-block_w;
 501     }
 502
 503     start_y= FFMAX(0, -src_y);
 504     start_x= FFMAX(0, -src_x);
 505     end_y= FFMIN(block_h, h-src_y);
 506     end_x= FFMIN(block_w, w-src_x);
 507
 508     // copy existing part
 509     for(y=start_y; y<end_y; y++){
 510         for(x=start_x; x<end_x; x++){
 511             buf[x + y*linesize]= src[x + y*linesize];
 512         }
 513     }
 514
 515     //top
 516     for(y=0; y<start_y; y++){
 517         for(x=start_x; x<end_x; x++){
 518             buf[x + y*linesize]= buf[x + start_y*linesize];
 519         }
 520     }
 521
 522     //bottom
 523     for(y=end_y; y<block_h; y++){
 524         for(x=start_x; x<end_x; x++){
 525             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 526         }
 527     }
 528
 529     for(y=0; y<block_h; y++){
 530        //left
 531         for(x=0; x<start_x; x++){
 532             buf[x + y*linesize]= buf[start_x + y*linesize];
 533         }
 534
 535        //right
 536         for(x=end_x; x<block_w; x++){
 537             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 538         }
 539     }
 540 }
 541
 542 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 543 {
 544     int i;
 545
 546     /* read the pixels */
 547     for(i=0;i<8;i++) {
 548         block[0] = pixels[0];
 549         block[1] = pixels[1];
 550         block[2] = pixels[2];
 551         block[3] = pixels[3];
 552         block[4] = pixels[4];
 553         block[5] = pixels[5];
 554         block[6] = pixels[6];
 555         block[7] = pixels[7];
 556         pixels += line_size;
 557         block += 8;
 558     }
 559 }
 560
 561 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 562                           const uint8_t *s2, int stride){
 563     int i;
 564
 565     /* read the pixels */
 566     for(i=0;i<8;i++) {
 567         block[0] = s1[0] - s2[0];
 568         block[1] = s1[1] - s2[1];
 569         block[2] = s1[2] - s2[2];
 570         block[3] = s1[3] - s2[3];
 571         block[4] = s1[4] - s2[4];
 572         block[5] = s1[5] - s2[5];
 573         block[6] = s1[6] - s2[6];
 574         block[7] = s1[7] - s2[7];
 575         s1 += stride;
 576         s2 += stride;
 577         block += 8;
 578     }
 579 }
 580
 581
 582 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 583                                  int line_size)
 584 {
 585     int i;
 586     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 587
 588     /* read the pixels */
 589     for(i=0;i<8;i++) {
 590         pixels[0] = cm[block[0]];
 591         pixels[1] = cm[block[1]];
 592         pixels[2] = cm[block[2]];
 593         pixels[3] = cm[block[3]];
 594         pixels[4] = cm[block[4]];
 595         pixels[5] = cm[block[5]];
 596         pixels[6] = cm[block[6]];
 597         pixels[7] = cm[block[7]];
 598
 599         pixels += line_size;
 600         block += 8;
 601     }
 602 }
 603
 604 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 605                                  int line_size)
 606 {
 607     int i;
 608     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 609
 610     /* read the pixels */
 611     for(i=0;i<4;i++) {
 612         pixels[0] = cm[block[0]];
 613         pixels[1] = cm[block[1]];
 614         pixels[2] = cm[block[2]];
 615         pixels[3] = cm[block[3]];
 616
 617         pixels += line_size;
 618         block += 8;
 619     }
 620 }
 621
 622 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 623                                  int line_size)
 624 {
 625     int i;
 626     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 627
 628     /* read the pixels */
 629     for(i=0;i<2;i++) {
 630         pixels[0] = cm[block[0]];
 631         pixels[1] = cm[block[1]];
 632
 633         pixels += line_size;
 634         block += 8;
 635     }
 636 }
 637
 638 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 639                                         uint8_t *restrict pixels,
 640                                         int line_size)
 641 {
 642     int i, j;
 643
 644     for (i = 0; i < 8; i++) {
 645         for (j = 0; j < 8; j++) {
 646             if (*block < -128)
 647                 *pixels = 0;
 648             else if (*block > 127)
 649                 *pixels = 255;
 650             else
 651                 *pixels = (uint8_t)(*block + 128);
 652             block++;
 653             pixels++;
 654         }
 655         pixels += (line_size - 8);
 656     }
 657 }
 658
 659 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 660                           int line_size)
 661 {
 662     int i;
 663     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 664
 665     /* read the pixels */
 666     for(i=0;i<8;i++) {
 667         pixels[0] = cm[pixels[0] + block[0]];
 668         pixels[1] = cm[pixels[1] + block[1]];
 669         pixels[2] = cm[pixels[2] + block[2]];
 670         pixels[3] = cm[pixels[3] + block[3]];
 671         pixels[4] = cm[pixels[4] + block[4]];
 672         pixels[5] = cm[pixels[5] + block[5]];
 673         pixels[6] = cm[pixels[6] + block[6]];
 674         pixels[7] = cm[pixels[7] + block[7]];
 675         pixels += line_size;
 676         block += 8;
 677     }
 678 }
 679
 680 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 681                           int line_size)
 682 {
 683     int i;
 684     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 685
 686     /* read the pixels */
 687     for(i=0;i<4;i++) {
 688         pixels[0] = cm[pixels[0] + block[0]];
 689         pixels[1] = cm[pixels[1] + block[1]];
 690         pixels[2] = cm[pixels[2] + block[2]];
 691         pixels[3] = cm[pixels[3] + block[3]];
 692         pixels += line_size;
 693         block += 8;
 694     }
 695 }
 696
 697 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 698                           int line_size)
 699 {
 700     int i;
 701     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 702
 703     /* read the pixels */
 704     for(i=0;i<2;i++) {
 705         pixels[0] = cm[pixels[0] + block[0]];
 706         pixels[1] = cm[pixels[1] + block[1]];
 707         pixels += line_size;
 708         block += 8;
 709     }
 710 }
 711
 712 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 713 {
 714     int i;
 715     for(i=0;i<8;i++) {
 716         pixels[0] += block[0];
 717         pixels[1] += block[1];
 718         pixels[2] += block[2];
 719         pixels[3] += block[3];
 720         pixels[4] += block[4];
 721         pixels[5] += block[5];
 722         pixels[6] += block[6];
 723         pixels[7] += block[7];
 724         pixels += line_size;
 725         block += 8;
 726     }
 727 }
 728
 729 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 730 {
 731     int i;
 732     for(i=0;i<4;i++) {
 733         pixels[0] += block[0];
 734         pixels[1] += block[1];
 735         pixels[2] += block[2];
 736         pixels[3] += block[3];
 737         pixels += line_size;
 738         block += 4;
 739     }
 740 }
 741
 742 static int sum_abs_dctelem_c(DCTELEM *block)
 743 {
 744     int sum=0, i;
 745     for(i=0; i<64; i++)
 746         sum+= FFABS(block[i]);
 747     return sum;
 748 }
 749
 750 #if 0
 751
 752 #define PIXOP2(OPNAME, OP) \
 753 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 754 {\
 755     int i;\
 756     for(i=0; i<h; i++){\
 757         OP(*((uint64_t*)block), AV_RN64(pixels));\
 758         pixels+=line_size;\
 759         block +=line_size;\
 760     }\
 761 }\
 762 \
 763 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 764 {\
 765     int i;\
 766     for(i=0; i<h; i++){\
 767         const uint64_t a= AV_RN64(pixels  );\
 768         const uint64_t b= AV_RN64(pixels+1);\
 769         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 770         pixels+=line_size;\
 771         block +=line_size;\
 772     }\
 773 }\
 774 \
 775 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 776 {\
 777     int i;\
 778     for(i=0; i<h; i++){\
 779         const uint64_t a= AV_RN64(pixels  );\
 780         const uint64_t b= AV_RN64(pixels+1);\
 781         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 782         pixels+=line_size;\
 783         block +=line_size;\
 784     }\
 785 }\
 786 \
 787 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 788 {\
 789     int i;\
 790     for(i=0; i<h; i++){\
 791         const uint64_t a= AV_RN64(pixels          );\
 792         const uint64_t b= AV_RN64(pixels+line_size);\
 793         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 794         pixels+=line_size;\
 795         block +=line_size;\
 796     }\
 797 }\
 798 \
 799 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 800 {\
 801     int i;\
 802     for(i=0; i<h; i++){\
 803         const uint64_t a= AV_RN64(pixels          );\
 804         const uint64_t b= AV_RN64(pixels+line_size);\
 805         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 806         pixels+=line_size;\
 807         block +=line_size;\
 808     }\
 809 }\
 810 \
 811 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 812 {\
 813         int i;\
 814         const uint64_t a= AV_RN64(pixels  );\
 815         const uint64_t b= AV_RN64(pixels+1);\
 816         uint64_t l0=  (a&0x0303030303030303ULL)\
 817                     + (b&0x0303030303030303ULL)\
 818                     + 0x0202020202020202ULL;\
 819         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 820                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 821         uint64_t l1,h1;\
 822 \
 823         pixels+=line_size;\
 824         for(i=0; i<h; i+=2){\
 825             uint64_t a= AV_RN64(pixels  );\
 826             uint64_t b= AV_RN64(pixels+1);\
 827             l1=  (a&0x0303030303030303ULL)\
 828                + (b&0x0303030303030303ULL);\
 829             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 830               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 831             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 832             pixels+=line_size;\
 833             block +=line_size;\
 834             a= AV_RN64(pixels  );\
 835             b= AV_RN64(pixels+1);\
 836             l0=  (a&0x0303030303030303ULL)\
 837                + (b&0x0303030303030303ULL)\
 838                + 0x0202020202020202ULL;\
 839             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 840               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 841             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 842             pixels+=line_size;\
 843             block +=line_size;\
 844         }\
 845 }\
 846 \
 847 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 848 {\
 849         int i;\
 850         const uint64_t a= AV_RN64(pixels  );\
 851         const uint64_t b= AV_RN64(pixels+1);\
 852         uint64_t l0=  (a&0x0303030303030303ULL)\
 853                     + (b&0x0303030303030303ULL)\
 854                     + 0x0101010101010101ULL;\
 855         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 856                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 857         uint64_t l1,h1;\
 858 \
 859         pixels+=line_size;\
 860         for(i=0; i<h; i+=2){\
 861             uint64_t a= AV_RN64(pixels  );\
 862             uint64_t b= AV_RN64(pixels+1);\
 863             l1=  (a&0x0303030303030303ULL)\
 864                + (b&0x0303030303030303ULL);\
 865             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 866               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 867             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 868             pixels+=line_size;\
 869             block +=line_size;\
 870             a= AV_RN64(pixels  );\
 871             b= AV_RN64(pixels+1);\
 872             l0=  (a&0x0303030303030303ULL)\
 873                + (b&0x0303030303030303ULL)\
 874                + 0x0101010101010101ULL;\
 875             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 876               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 877             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 878             pixels+=line_size;\
 879             block +=line_size;\
 880         }\
 881 }\
 882 \
 883 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 884 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 885 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 886 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 887 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 888 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 889 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 890
 891 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 892 #else // 64 bit variant
 893
 894 #define PIXOP2(OPNAME, OP) \
 895 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 896     int i;\
 897     for(i=0; i<h; i++){\
 898         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 899         pixels+=line_size;\
 900         block +=line_size;\
 901     }\
 902 }\
 903 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 904     int i;\
 905     for(i=0; i<h; i++){\
 906         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 907         pixels+=line_size;\
 908         block +=line_size;\
 909     }\
 910 }\
 911 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 912     int i;\
 913     for(i=0; i<h; i++){\
 914         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 915         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 916         pixels+=line_size;\
 917         block +=line_size;\
 918     }\
 919 }\
 920 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 921     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 922 }\
 923 \
 924 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 925                                                 int src_stride1, int src_stride2, int h){\
 926     int i;\
 927     for(i=0; i<h; i++){\
 928         uint32_t a,b;\
 929         a= AV_RN32(&src1[i*src_stride1  ]);\
 930         b= AV_RN32(&src2[i*src_stride2  ]);\
 931         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 932         a= AV_RN32(&src1[i*src_stride1+4]);\
 933         b= AV_RN32(&src2[i*src_stride2+4]);\
 934         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 935     }\
 936 }\
 937 \
 938 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 939                                                 int src_stride1, int src_stride2, int h){\
 940     int i;\
 941     for(i=0; i<h; i++){\
 942         uint32_t a,b;\
 943         a= AV_RN32(&src1[i*src_stride1  ]);\
 944         b= AV_RN32(&src2[i*src_stride2  ]);\
 945         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 946         a= AV_RN32(&src1[i*src_stride1+4]);\
 947         b= AV_RN32(&src2[i*src_stride2+4]);\
 948         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 949     }\
 950 }\
 951 \
 952 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 953                                                 int src_stride1, int src_stride2, int h){\
 954     int i;\
 955     for(i=0; i<h; i++){\
 956         uint32_t a,b;\
 957         a= AV_RN32(&src1[i*src_stride1  ]);\
 958         b= AV_RN32(&src2[i*src_stride2  ]);\
 959         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 960     }\
 961 }\
 962 \
 963 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 964                                                 int src_stride1, int src_stride2, int h){\
 965     int i;\
 966     for(i=0; i<h; i++){\
 967         uint32_t a,b;\
 968         a= AV_RN16(&src1[i*src_stride1  ]);\
 969         b= AV_RN16(&src2[i*src_stride2  ]);\
 970         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 971     }\
 972 }\
 973 \
 974 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 975                                                 int src_stride1, int src_stride2, int h){\
 976     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 977     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 978 }\
 979 \
 980 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 981                                                 int src_stride1, int src_stride2, int h){\
 982     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 983     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 984 }\
 985 \
 986 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 987     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 988 }\
 989 \
 990 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 991     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 992 }\
 993 \
 994 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 995     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 996 }\
 997 \
 998 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 999     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1000 }\
1001 \
1002 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1003                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1004     int i;\
1005     for(i=0; i<h; i++){\
1006         uint32_t a, b, c, d, l0, l1, h0, h1;\
1007         a= AV_RN32(&src1[i*src_stride1]);\
1008         b= AV_RN32(&src2[i*src_stride2]);\
1009         c= AV_RN32(&src3[i*src_stride3]);\
1010         d= AV_RN32(&src4[i*src_stride4]);\
1011         l0=  (a&0x03030303UL)\
1012            + (b&0x03030303UL)\
1013            + 0x02020202UL;\
1014         h0= ((a&0xFCFCFCFCUL)>>2)\
1015           + ((b&0xFCFCFCFCUL)>>2);\
1016         l1=  (c&0x03030303UL)\
1017            + (d&0x03030303UL);\
1018         h1= ((c&0xFCFCFCFCUL)>>2)\
1019           + ((d&0xFCFCFCFCUL)>>2);\
1020         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1021         a= AV_RN32(&src1[i*src_stride1+4]);\
1022         b= AV_RN32(&src2[i*src_stride2+4]);\
1023         c= AV_RN32(&src3[i*src_stride3+4]);\
1024         d= AV_RN32(&src4[i*src_stride4+4]);\
1025         l0=  (a&0x03030303UL)\
1026            + (b&0x03030303UL)\
1027            + 0x02020202UL;\
1028         h0= ((a&0xFCFCFCFCUL)>>2)\
1029           + ((b&0xFCFCFCFCUL)>>2);\
1030         l1=  (c&0x03030303UL)\
1031            + (d&0x03030303UL);\
1032         h1= ((c&0xFCFCFCFCUL)>>2)\
1033           + ((d&0xFCFCFCFCUL)>>2);\
1034         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035     }\
1036 }\
1037 \
1038 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1039     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1040 }\
1041 \
1042 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1043     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1044 }\
1045 \
1046 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1047     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1048 }\
1049 \
1050 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1051     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1052 }\
1053 \
1054 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1055                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1056     int i;\
1057     for(i=0; i<h; i++){\
1058         uint32_t a, b, c, d, l0, l1, h0, h1;\
1059         a= AV_RN32(&src1[i*src_stride1]);\
1060         b= AV_RN32(&src2[i*src_stride2]);\
1061         c= AV_RN32(&src3[i*src_stride3]);\
1062         d= AV_RN32(&src4[i*src_stride4]);\
1063         l0=  (a&0x03030303UL)\
1064            + (b&0x03030303UL)\
1065            + 0x01010101UL;\
1066         h0= ((a&0xFCFCFCFCUL)>>2)\
1067           + ((b&0xFCFCFCFCUL)>>2);\
1068         l1=  (c&0x03030303UL)\
1069            + (d&0x03030303UL);\
1070         h1= ((c&0xFCFCFCFCUL)>>2)\
1071           + ((d&0xFCFCFCFCUL)>>2);\
1072         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073         a= AV_RN32(&src1[i*src_stride1+4]);\
1074         b= AV_RN32(&src2[i*src_stride2+4]);\
1075         c= AV_RN32(&src3[i*src_stride3+4]);\
1076         d= AV_RN32(&src4[i*src_stride4+4]);\
1077         l0=  (a&0x03030303UL)\
1078            + (b&0x03030303UL)\
1079            + 0x01010101UL;\
1080         h0= ((a&0xFCFCFCFCUL)>>2)\
1081           + ((b&0xFCFCFCFCUL)>>2);\
1082         l1=  (c&0x03030303UL)\
1083            + (d&0x03030303UL);\
1084         h1= ((c&0xFCFCFCFCUL)>>2)\
1085           + ((d&0xFCFCFCFCUL)>>2);\
1086         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1087     }\
1088 }\
1089 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1090                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1091     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1092     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1093 }\
1094 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1095                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1096     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1097     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1098 }\
1099 \
1100 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1101 {\
1102         int i, a0, b0, a1, b1;\
1103         a0= pixels[0];\
1104         b0= pixels[1] + 2;\
1105         a0 += b0;\
1106         b0 += pixels[2];\
1107 \
1108         pixels+=line_size;\
1109         for(i=0; i<h; i+=2){\
1110             a1= pixels[0];\
1111             b1= pixels[1];\
1112             a1 += b1;\
1113             b1 += pixels[2];\
1114 \
1115             block[0]= (a1+a0)>>2; /* FIXME non put */\
1116             block[1]= (b1+b0)>>2;\
1117 \
1118             pixels+=line_size;\
1119             block +=line_size;\
1120 \
1121             a0= pixels[0];\
1122             b0= pixels[1] + 2;\
1123             a0 += b0;\
1124             b0 += pixels[2];\
1125 \
1126             block[0]= (a1+a0)>>2;\
1127             block[1]= (b1+b0)>>2;\
1128             pixels+=line_size;\
1129             block +=line_size;\
1130         }\
1131 }\
1132 \
1133 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1134 {\
1135         int i;\
1136         const uint32_t a= AV_RN32(pixels  );\
1137         const uint32_t b= AV_RN32(pixels+1);\
1138         uint32_t l0=  (a&0x03030303UL)\
1139                     + (b&0x03030303UL)\
1140                     + 0x02020202UL;\
1141         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1142                    + ((b&0xFCFCFCFCUL)>>2);\
1143         uint32_t l1,h1;\
1144 \
1145         pixels+=line_size;\
1146         for(i=0; i<h; i+=2){\
1147             uint32_t a= AV_RN32(pixels  );\
1148             uint32_t b= AV_RN32(pixels+1);\
1149             l1=  (a&0x03030303UL)\
1150                + (b&0x03030303UL);\
1151             h1= ((a&0xFCFCFCFCUL)>>2)\
1152               + ((b&0xFCFCFCFCUL)>>2);\
1153             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1154             pixels+=line_size;\
1155             block +=line_size;\
1156             a= AV_RN32(pixels  );\
1157             b= AV_RN32(pixels+1);\
1158             l0=  (a&0x03030303UL)\
1159                + (b&0x03030303UL)\
1160                + 0x02020202UL;\
1161             h0= ((a&0xFCFCFCFCUL)>>2)\
1162               + ((b&0xFCFCFCFCUL)>>2);\
1163             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1164             pixels+=line_size;\
1165             block +=line_size;\
1166         }\
1167 }\
1168 \
1169 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1170 {\
1171     int j;\
1172     for(j=0; j<2; j++){\
1173         int i;\
1174         const uint32_t a= AV_RN32(pixels  );\
1175         const uint32_t b= AV_RN32(pixels+1);\
1176         uint32_t l0=  (a&0x03030303UL)\
1177                     + (b&0x03030303UL)\
1178                     + 0x02020202UL;\
1179         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1180                    + ((b&0xFCFCFCFCUL)>>2);\
1181         uint32_t l1,h1;\
1182 \
1183         pixels+=line_size;\
1184         for(i=0; i<h; i+=2){\
1185             uint32_t a= AV_RN32(pixels  );\
1186             uint32_t b= AV_RN32(pixels+1);\
1187             l1=  (a&0x03030303UL)\
1188                + (b&0x03030303UL);\
1189             h1= ((a&0xFCFCFCFCUL)>>2)\
1190               + ((b&0xFCFCFCFCUL)>>2);\
1191             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1192             pixels+=line_size;\
1193             block +=line_size;\
1194             a= AV_RN32(pixels  );\
1195             b= AV_RN32(pixels+1);\
1196             l0=  (a&0x03030303UL)\
1197                + (b&0x03030303UL)\
1198                + 0x02020202UL;\
1199             h0= ((a&0xFCFCFCFCUL)>>2)\
1200               + ((b&0xFCFCFCFCUL)>>2);\
1201             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1202             pixels+=line_size;\
1203             block +=line_size;\
1204         }\
1205         pixels+=4-line_size*(h+1);\
1206         block +=4-line_size*h;\
1207     }\
1208 }\
1209 \
1210 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1211 {\
1212     int j;\
1213     for(j=0; j<2; j++){\
1214         int i;\
1215         const uint32_t a= AV_RN32(pixels  );\
1216         const uint32_t b= AV_RN32(pixels+1);\
1217         uint32_t l0=  (a&0x03030303UL)\
1218                     + (b&0x03030303UL)\
1219                     + 0x01010101UL;\
1220         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1221                    + ((b&0xFCFCFCFCUL)>>2);\
1222         uint32_t l1,h1;\
1223 \
1224         pixels+=line_size;\
1225         for(i=0; i<h; i+=2){\
1226             uint32_t a= AV_RN32(pixels  );\
1227             uint32_t b= AV_RN32(pixels+1);\
1228             l1=  (a&0x03030303UL)\
1229                + (b&0x03030303UL);\
1230             h1= ((a&0xFCFCFCFCUL)>>2)\
1231               + ((b&0xFCFCFCFCUL)>>2);\
1232             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1233             pixels+=line_size;\
1234             block +=line_size;\
1235             a= AV_RN32(pixels  );\
1236             b= AV_RN32(pixels+1);\
1237             l0=  (a&0x03030303UL)\
1238                + (b&0x03030303UL)\
1239                + 0x01010101UL;\
1240             h0= ((a&0xFCFCFCFCUL)>>2)\
1241               + ((b&0xFCFCFCFCUL)>>2);\
1242             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1243             pixels+=line_size;\
1244             block +=line_size;\
1245         }\
1246         pixels+=4-line_size*(h+1);\
1247         block +=4-line_size*h;\
1248     }\
1249 }\
1250 \
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1252 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1256 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1257 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1258 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1259
/* OP for the "avg" instantiation: replaces a with rnd_avg32() of the old a
 * and the new value b. */
#define op_avg(a, b) (a) = rnd_avg32((a), (b))
1261 #endif
/* OP for the "put" instantiation: plain store of b into a. */
#define op_put(a, b) (a) = (b)
1263
1264 PIXOP2(avg, op_avg)
1265 PIXOP2(put, op_put)
1266 #undef op_avg
1267 #undef op_put
1268
/* Rounded average of two values. Arguments are parenthesized so that
 * expressions with low-precedence operators (|, &, ?:) expand correctly. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded average of four values. Arguments are parenthesized so that
 * expressions with low-precedence operators (|, &, ?:) expand correctly. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1271
/**
 * Single-stride entry point for the 16-wide no-rounding two-source blend:
 * dst, a and b all use the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1275
/**
 * Single-stride entry point for the 8-wide no-rounding two-source blend:
 * dst, a and b all use the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1279
/**
 * Bilinear interpolation of an 8-pixel-wide block at a fractional offset
 * given in 1/16 pixel units.
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour, the pixel below and the one below-right. The four weights add
 * up to 256, so the sum plus rounder is shifted right by 8.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  distance in bytes between rows, for both dst and src
 * @param h       number of rows
 * @param x16     horizontal fraction (x16/16)
 * @param y16     vertical fraction (y16/16)
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_here       = (16 - x16) * (16 - y16);
    const int w_right      =       x16  * (16 - y16);
    const int w_below      = (16 - x16) *       y16;
    const int w_belowright =       x16  *       y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_here       * src[col]
                      + w_right      * src[col + 1]
                      + w_below      * src[stride + col]
                      + w_belowright * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1302
/**
 * Affine motion compensation of an 8-pixel-wide, h-row block.
 *
 * The source position starts at (ox, oy) and advances by (dxx, dyx) per
 * output column and by (dxy, dyy) per output row. After dropping the low
 * 16 bits, the next `shift` bits are the interpolation fraction and the
 * rest is the integer position. Inside the picture the sample is
 * interpolated bilinearly; when one coordinate is outside, it is clamped
 * and only the other axis is interpolated; when both are outside, the
 * clamped sample is copied.
 *
 * @param dst     destination block
 * @param src     source picture
 * @param stride  distance in bytes between rows, for both dst and src
 * @param h       number of rows to produce
 * @param r       value added before the final shift by 2*shift
 * @param width   source width; positions 0..width-2 count as inside
 * @param height  source height; positions 0..height-2 count as inside
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one       = 1<<shift;
    const int frac_mask = one - 1;
    const int last_x    = width  - 1;
    const int last_y    = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + row*stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int pos_x  = vx>>16;
            const int pos_y  = vy>>16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* the unsigned compare also rejects negative positions */
            const int x_inside = (unsigned)src_x < last_x;
            const int y_inside = (unsigned)src_y < last_y;
            const int index = (x_inside ? src_x : av_clip(src_x, 0, last_x))
                            + (y_inside ? src_y : av_clip(src_y, 0, last_y))*stride;

            if (x_inside && y_inside) {
                out[col] = (  (  src[index         ]*(one-frac_x)
                               + src[index       +1]*     frac_x )*(one-frac_y)
                            + (  src[index+stride  ]*(one-frac_x)
                               + src[index+stride+1]*     frac_x )*     frac_y
                            + r)>>(shift*2);
            } else if (x_inside) {
                /* row clamped: horizontal interpolation only */
                out[col] = ( (  src[index  ]*(one-frac_x)
                              + src[index+1]*     frac_x )*one
                            + r)>>(shift*2);
            } else if (y_inside) {
                /* column clamped: vertical interpolation only */
                out[col] = ( (  src[index       ]*(one-frac_y)
                              + src[index+stride]*     frac_y )*one
                            + r)>>(shift*2);
            } else {
                out[col] = src[index];
            }
        }
    }
}
1360
/**
 * Third-pel MC, integer position: dispatches to the plain copy routine for
 * the given block width. Widths other than 2, 4, 8 and 16 write nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1369
/**
 * Third-pel MC, 1/3 pixel to the right: dst = (2*cur + right + 1) / 3.
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1380
/**
 * Third-pel MC, 2/3 pixel to the right: dst = (cur + 2*right + 1) / 3.
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1391
/**
 * Third-pel MC, 1/3 pixel down: dst = (2*cur + below + 1) / 3.
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1402
/**
 * Third-pel MC, 1/3 right and 1/3 down:
 * dst = (4*cur + 3*right + 3*below + 2*below_right + 6) / 12.
 * The division by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col]        + 3*src[col+1]
                          + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1413
/**
 * Third-pel MC, 1/3 right and 2/3 down:
 * dst = (3*cur + 2*right + 4*below + 3*below_right + 6) / 12.
 * The division by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col]        + 2*src[col+1]
                          + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1424
/**
 * Third-pel MC, 2/3 pixel down: dst = (cur + 2*below + 1) / 3.
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1435
/**
 * Third-pel MC, 2/3 right and 1/3 down:
 * dst = (3*cur + 4*right + 2*below + 3*below_right + 6) / 12.
 * The division by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col]        + 4*src[col+1]
                          + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1446
/**
 * Third-pel MC, 2/3 right and 2/3 down:
 * dst = (2*cur + 3*right + 3*below + 4*below_right + 6) / 12.
 * The division by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col]        + 3*src[col+1]
                          + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1457
/**
 * Third-pel MC, integer position, averaging variant: dispatches to the
 * averaging routine for the given block width. Widths other than 2, 4, 8
 * and 16 write nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1466
/**
 * Third-pel MC, 1/3 pixel to the right, averaged into dst with rounding up:
 * pred = (2*cur + right + 1) / 3, dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1477
/**
 * Third-pel MC, 2/3 pixel to the right, averaged into dst with rounding up:
 * pred = (cur + 2*right + 1) / 3, dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1488
/**
 * Third-pel MC, 1/3 pixel down, averaged into dst with rounding up:
 * pred = (2*cur + below + 1) / 3, dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1499
/**
 * Third-pel MC, 1/3 right and 1/3 down, averaged into dst with rounding up:
 * pred = (4*cur + 3*right + 3*below + 2*below_right + 6) / 12,
 * dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1510
/**
 * Third-pel MC, 1/3 right and 2/3 down, averaged into dst with rounding up:
 * pred = (3*cur + 2*right + 4*below + 3*below_right + 6) / 12,
 * dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1521
/**
 * Third-pel MC, 2/3 pixel down, averaged into dst with rounding up:
 * pred = (cur + 2*below + 1) / 3, dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1532
/**
 * Third-pel MC, 2/3 right and 1/3 down, averaged into dst with rounding up:
 * pred = (3*cur + 4*right + 2*below + 3*below_right + 6) / 12,
 * dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1543
/**
 * Third-pel MC, 2/3 right and 2/3 down, averaged into dst with rounding up:
 * pred = (2*cur + 3*right + 3*below + 4*below_right + 6) / 12,
 * dst = (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1554 #if 0
/* Generates fixed-width wrappers put_tpel_pixels<width>_mcXY_c() that forward
 * to the generic put_tpel_pixels_mcXY_c() with the width filled in.
 * Each body used to start with a stray "void", which turned the intended
 * call into an invalid declaration; the wrappers now really call through.
 * NOTE: this macro sits inside an "#if 0" region and is currently unused. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1574 #endif
1575
/**
 * H.264 chroma motion compensation, plain C version.
 *
 * H264_CHROMA_MC_WIDTH(OPNAME, OP, WIDTH) emits
 *     static void OPNAME##h264_chroma_mc##WIDTH##_c(dst, src, stride, h, x, y)
 * which produces a block WIDTH pixels wide and h rows tall. dst and src are
 * both advanced by stride after every row.
 *
 * (x, y) is the sub-sample offset in eighths and is asserted to lie in 0..7.
 * It yields four bilinear weights (top-left, top-right, bottom-left,
 * bottom-right) that always add up to 64. OP(pixel, sum) receives the
 * destination pixel and the weighted sum and is responsible for scaling the
 * sum back down and storing it.
 */
#define H264_CHROMA_MC_WIDTH(OPNAME, OP, WIDTH)\
static void OPNAME ## h264_chroma_mc ## WIDTH ## _c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
\
    assert(x>=0 && x<8 && y>=0 && y<8);\
\
    if(wBR){\
        /* both offsets are non-zero: all four neighbours contribute */\
        for(row=0; row<h; row++){\
            for(col=0; col<WIDTH; col++){\
                OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x or y is zero, so at most one of wTR/wBL is non-zero: blend the */\
        /* pixel with the one below it (step = stride) or to its right (1)  */\
        const int wPair= wTR+wBL;\
        const int step= wBL ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<WIDTH; col++){\
                OP(dst[col], (wTL*src[col] + wPair*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/** Emits the 2-, 4- and 8-pixel-wide chroma MC functions for one OPNAME/OP pair. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_WIDTH(OPNAME, OP, 2)\
H264_CHROMA_MC_WIDTH(OPNAME, OP, 4)\
H264_CHROMA_MC_WIDTH(OPNAME, OP, 8)

/* b is a sum weighted by 64: round to nearest and divide by 64.
 * op_put stores the result, op_avg averages it (rounding up) with what is
 * already in the destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1684
/**
 * 8-pixel-wide bilinear chroma interpolation that stores its result ("put"),
 * using the smaller rounding bias of the "no_rnd" variant.
 *
 * @param dst    destination, 8 pixels written per row
 * @param src    source; pixels 0..8 of the current row and of the row below
 *               are read for every output row, whatever x and y are
 * @param stride distance in bytes between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal offset in eighths, asserted to be 0..7
 * @param y      vertical offset in eighths, asserted to be 0..7
 *
 * The four weights add up to 64; the weighted sum gets a bias of 32 - 4 = 28
 * (rather than 32) before the division by 64.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int wTL = (8 - x) * (8 - y);
    const int wTR = (    x) * (8 - y);
    const int wBL = (8 - x) * (    y);
    const int wBR = (    x) * (    y);
    int row, col;

    assert(x >= 0 && x < 8 && y >= 0 && y < 8);

    for (row = 0; row < h; row++) {
        const uint8_t *upper = src;
        const uint8_t *lower = src + stride;

        for (col = 0; col < 8; col++) {
            const int sum = wTL * upper[col] + wTR * upper[col + 1]
                          + wBL * lower[col] + wBR * lower[col + 1];
            dst[col] = (sum + 32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1708
/**
 * 8-pixel-wide bilinear chroma interpolation whose result is combined with
 * the pixels already in dst through avg2().
 *
 * The prediction itself is computed like in the "put" flavour: four weights
 * derived from (x, y) that add up to 64, a bias of 32 - 4 = 28, then a
 * division by 64.
 *
 * @param dst    destination, 8 pixels read and rewritten per row
 * @param src    source; pixels 0..8 of the current row and of the row below
 *               are read for every output row, whatever x and y are
 * @param stride distance in bytes between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal offset in eighths, asserted to be 0..7
 * @param y      vertical offset in eighths, asserted to be 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int wTL = (8 - x) * (8 - y);
    const int wTR = (    x) * (8 - y);
    const int wBL = (8 - x) * (    y);
    const int wBR = (    x) * (    y);
    int row, col;

    assert(x >= 0 && x < 8 && y >= 0 && y < 8);

    for (row = 0; row < h; row++) {
        const uint8_t *upper = src;
        const uint8_t *lower = src + stride;

        for (col = 0; col < 8; col++) {
            const int pred = (wTL * upper[col] + wTR * upper[col + 1]
                            + wBL * lower[col] + wBR * lower[col + 1]
                            + 32 - 4) >> 6;
            dst[col] = avg2(dst[col], pred);
        }
        dst += stride;
        src += stride;
    }
}
1732
1733 #define QPEL_MC(r, OPNAME, RND, OP) \
1734 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1735     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1736     int i;\
1737     for(i=0; i<h; i++)\
1738     {\
1739         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1740         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1741         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1742         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1743         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1744         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1745         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1746         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1747         dst+=dstStride;\
1748         src+=srcStride;\
1749     }\
1750 }\
1751 \
1752 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1753     const int w=8;\
1754     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1755     int i;\
1756     for(i=0; i<w; i++)\
1757     {\
1758         const int src0= src[0*srcStride];\
1759         const int src1= src[1*srcStride];\
1760         const int src2= src[2*srcStride];\
1761         const int src3= src[3*srcStride];\
1762         const int src4= src[4*srcStride];\
1763         const int src5= src[5*srcStride];\
1764         const int src6= src[6*srcStride];\
1765         const int src7= src[7*srcStride];\
1766         const int src8= src[8*srcStride];\
1767         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1768         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1769         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1770         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1771         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1772         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1773         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1774         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1775         dst++;\
1776         src++;\
1777     }\
1778 }\
1779 \
1780 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1781     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1782     int i;\
1783     \
1784     for(i=0; i<h; i++)\
1785     {\
1786         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1787         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1788         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1789         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1790         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1791         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1792         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1793         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1794         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1795         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1796         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1797         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1798         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1799         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1800         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1801         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1802         dst+=dstStride;\
1803         src+=srcStride;\
1804     }\
1805 }\
1806 \
1807 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1808     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1809     int i;\
1810     const int w=16;\
1811     for(i=0; i<w; i++)\
1812     {\
1813         const int src0= src[0*srcStride];\
1814         const int src1= src[1*srcStride];\
1815         const int src2= src[2*srcStride];\
1816         const int src3= src[3*srcStride];\
1817         const int src4= src[4*srcStride];\
1818         const int src5= src[5*srcStride];\
1819         const int src6= src[6*srcStride];\
1820         const int src7= src[7*srcStride];\
1821         const int src8= src[8*srcStride];\
1822         const int src9= src[9*srcStride];\
1823         const int src10= src[10*srcStride];\
1824         const int src11= src[11*srcStride];\
1825         const int src12= src[12*srcStride];\
1826         const int src13= src[13*srcStride];\
1827         const int src14= src[14*srcStride];\
1828         const int src15= src[15*srcStride];\
1829         const int src16= src[16*srcStride];\
1830         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1831         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1832         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1833         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1834         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1835         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1836         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1837         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1838         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1839         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1840         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1841         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1842         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1843         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1844         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1845         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1846         dst++;\
1847         src++;\
1848     }\
1849 }\
1850 \
1851 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1852     OPNAME ## pixels8_c(dst, src, stride, 8);\
1853 }\
1854 \
1855 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1856     uint8_t half[64];\
1857     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1858     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1859 }\
1860 \
1861 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1862     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1863 }\
1864 \
1865 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1866     uint8_t half[64];\
1867     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1868     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1869 }\
1870 \
1871 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1872     uint8_t full[16*9];\
1873     uint8_t half[64];\
1874     copy_block9(full, src, 16, stride, 9);\
1875     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1876     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1877 }\
1878 \
1879 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1880     uint8_t full[16*9];\
1881     copy_block9(full, src, 16, stride, 9);\
1882     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1883 }\
1884 \
1885 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1886     uint8_t full[16*9];\
1887     uint8_t half[64];\
1888     copy_block9(full, src, 16, stride, 9);\
1889     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1890     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1891 }\
1892 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t full[16*9];\
1894     uint8_t halfH[72];\
1895     uint8_t halfV[64];\
1896     uint8_t halfHV[64];\
1897     copy_block9(full, src, 16, stride, 9);\
1898     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1899     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1900     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1901     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1902 }\
1903 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1904     uint8_t full[16*9];\
1905     uint8_t halfH[72];\
1906     uint8_t halfHV[64];\
1907     copy_block9(full, src, 16, stride, 9);\
1908     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1909     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1910     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1911     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1912 }\
1913 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1914     uint8_t full[16*9];\
1915     uint8_t halfH[72];\
1916     uint8_t halfV[64];\
1917     uint8_t halfHV[64];\
1918     copy_block9(full, src, 16, stride, 9);\
1919     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1920     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1921     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1922     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1923 }\
1924 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1925     uint8_t full[16*9];\
1926     uint8_t halfH[72];\
1927     uint8_t halfHV[64];\
1928     copy_block9(full, src, 16, stride, 9);\
1929     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1930     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1931     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1932     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1933 }\
1934 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1935     uint8_t full[16*9];\
1936     uint8_t halfH[72];\
1937     uint8_t halfV[64];\
1938     uint8_t halfHV[64];\
1939     copy_block9(full, src, 16, stride, 9);\
1940     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1941     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1942     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1943     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1944 }\
1945 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1946     uint8_t full[16*9];\
1947     uint8_t halfH[72];\
1948     uint8_t halfHV[64];\
1949     copy_block9(full, src, 16, stride, 9);\
1950     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1951     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1952     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1954 }\
1955 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956     uint8_t full[16*9];\
1957     uint8_t halfH[72];\
1958     uint8_t halfV[64];\
1959     uint8_t halfHV[64];\
1960     copy_block9(full, src, 16, stride, 9);\
1961     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1962     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1963     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1964     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1965 }\
1966 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1967     uint8_t full[16*9];\
1968     uint8_t halfH[72];\
1969     uint8_t halfHV[64];\
1970     copy_block9(full, src, 16, stride, 9);\
1971     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1972     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1973     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1974     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1975 }\
1976 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1977     uint8_t halfH[72];\
1978     uint8_t halfHV[64];\
1979     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1980     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1981     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1982 }\
1983 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1984     uint8_t halfH[72];\
1985     uint8_t halfHV[64];\
1986     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1987     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1988     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1989 }\
1990 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1991     uint8_t full[16*9];\
1992     uint8_t halfH[72];\
1993     uint8_t halfV[64];\
1994     uint8_t halfHV[64];\
1995     copy_block9(full, src, 16, stride, 9);\
1996     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1998     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1999     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2000 }\
2001 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2002     uint8_t full[16*9];\
2003     uint8_t halfH[72];\
2004     copy_block9(full, src, 16, stride, 9);\
2005     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2006     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2007     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2008 }\
2009 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2010     uint8_t full[16*9];\
2011     uint8_t halfH[72];\
2012     uint8_t halfV[64];\
2013     uint8_t halfHV[64];\
2014     copy_block9(full, src, 16, stride, 9);\
2015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2016     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2018     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2019 }\
2020 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2021     uint8_t full[16*9];\
2022     uint8_t halfH[72];\
2023     copy_block9(full, src, 16, stride, 9);\
2024     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2025     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2026     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2027 }\
2028 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2029     uint8_t halfH[72];\
2030     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2031     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2032 }\
2033 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2034     OPNAME ## pixels16_c(dst, src, stride, 16);\
2035 }\
2036 \
2037 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2038     uint8_t half[256];\
2039     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2040     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2041 }\
2042 \
2043 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2044     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2045 }\
2046 \
2047 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2048     uint8_t half[256];\
2049     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2050     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2051 }\
2052 \
2053 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2054     uint8_t full[24*17];\
2055     uint8_t half[256];\
2056     copy_block17(full, src, 24, stride, 17);\
2057     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2058     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2059 }\
2060 \
2061 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2062     uint8_t full[24*17];\
2063     copy_block17(full, src, 24, stride, 17);\
2064     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2065 }\
2066 \
2067 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2068     uint8_t full[24*17];\
2069     uint8_t half[256];\
2070     copy_block17(full, src, 24, stride, 17);\
2071     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2072     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2073 }\
2074 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2075     uint8_t full[24*17];\
2076     uint8_t halfH[272];\
2077     uint8_t halfV[256];\
2078     uint8_t halfHV[256];\
2079     copy_block17(full, src, 24, stride, 17);\
2080     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2082     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2083     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2084 }\
2085 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2086     uint8_t full[24*17];\
2087     uint8_t halfH[272];\
2088     uint8_t halfHV[256];\
2089     copy_block17(full, src, 24, stride, 17);\
2090     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2091     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2092     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2093     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2094 }\
2095 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2096     uint8_t full[24*17];\
2097     uint8_t halfH[272];\
2098     uint8_t halfV[256];\
2099     uint8_t halfHV[256];\
2100     copy_block17(full, src, 24, stride, 17);\
2101     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2102     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2103     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2104     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2105 }\
2106 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2107     uint8_t full[24*17];\
2108     uint8_t halfH[272];\
2109     uint8_t halfHV[256];\
2110     copy_block17(full, src, 24, stride, 17);\
2111     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2112     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2113     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2114     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2115 }\
2116 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2117     uint8_t full[24*17];\
2118     uint8_t halfH[272];\
2119     uint8_t halfV[256];\
2120     uint8_t halfHV[256];\
2121     copy_block17(full, src, 24, stride, 17);\
2122     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2123     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2124     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2125     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2126 }\
2127 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2128     uint8_t full[24*17];\
2129     uint8_t halfH[272];\
2130     uint8_t halfHV[256];\
2131     copy_block17(full, src, 24, stride, 17);\
2132     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2133     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2134     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2136 }\
2137 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2138     uint8_t full[24*17];\
2139     uint8_t halfH[272];\
2140     uint8_t halfV[256];\
2141     uint8_t halfHV[256];\
2142     copy_block17(full, src, 24, stride, 17);\
2143     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2144     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2145     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2146     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2147 }\
2148 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2149     uint8_t full[24*17];\
2150     uint8_t halfH[272];\
2151     uint8_t halfHV[256];\
2152     copy_block17(full, src, 24, stride, 17);\
2153     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2154     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2155     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2156     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2157 }\
2158 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2159     uint8_t halfH[272];\
2160     uint8_t halfHV[256];\
2161     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2162     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2163     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2164 }\
2165 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2166     uint8_t halfH[272];\
2167     uint8_t halfHV[256];\
2168     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2169     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2170     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2171 }\
2172 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2173     uint8_t full[24*17];\
2174     uint8_t halfH[272];\
2175     uint8_t halfV[256];\
2176     uint8_t halfHV[256];\
2177     copy_block17(full, src, 24, stride, 17);\
2178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2180     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2181     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2182 }\
2183 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2184     uint8_t full[24*17];\
2185     uint8_t halfH[272];\
2186     copy_block17(full, src, 24, stride, 17);\
2187     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2188     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2189     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2190 }\
2191 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2192     uint8_t full[24*17];\
2193     uint8_t halfH[272];\
2194     uint8_t halfV[256];\
2195     uint8_t halfHV[256];\
2196     copy_block17(full, src, 24, stride, 17);\
2197     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2198     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2199     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2200     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2201 }\
2202 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2203     uint8_t full[24*17];\
2204     uint8_t halfH[272];\
2205     copy_block17(full, src, 24, stride, 17);\
2206     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2207     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2208     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2209 }\
2210 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2211     uint8_t halfH[272];\
2212     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2213     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2214 }
2215
2216 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2217 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2218 #define op_put(a, b) a = cm[((b) + 16)>>5]
2219 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2220
2221 QPEL_MC(0, put_       , _       , op_put)
2222 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2223 QPEL_MC(0, avg_       , _       , op_avg)
2224 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2225 #undef op_avg
2226 #undef op_avg_no_rnd
2227 #undef op_put
2228 #undef op_put_no_rnd
2229
2230 #if 1
/* Expands to nothing: passed where a generated function needs no attribute. */
#define H264_LOWPASS_NO_ATTR

/*
 * Six-tap kernel with weights (1, -5, 20, 20, -5, 1) over the consecutive
 * samples a..f. The result is left unscaled; the OP/OP2 store operators
 * given to H264_LOWPASS decide how it is written to the destination.
 */
#define H264_LOWPASS_TAP6(a, b, c, d, e, f) \
    (((c)+(d))*20 - ((b)+(e))*5 + ((a)+(f)))

/*
 * Generates the three NxN low-pass functions for one block size:
 *
 *   OPNAME h264_qpelN_h_lowpass  - horizontal filter; reads columns -2..N+2
 *                                  of each of the N source rows.
 *   OPNAME h264_qpelN_v_lowpass  - vertical filter; reads rows -2..N+2 of
 *                                  each of the N source columns.
 *   OPNAME h264_qpelN_hv_lowpass - horizontal filter of rows -2..N+2 into the
 *                                  16-bit scratch buffer `tmp` (N+5 rows of
 *                                  tmpStride elements), followed by a
 *                                  vertical filter of that buffer.
 *
 * OP stores a single-pass sum, OP2 stores a two-pass sum. Both expect a
 * local `cm` table pointer, which every function sets up.
 * ATTR is placed between `static` and the return type.
 */
#define H264_LOWPASS_N(OPNAME, OP, OP2, N, ATTR) \
static ATTR void OPNAME ## h264_qpel ## N ## _h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<N; y++){\
        for(x=0; x<N; x++)\
            OP(dst[x], H264_LOWPASS_TAP6(src[x-2], src[x-1], src[x], src[x+1], src[x+2], src[x+3]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static ATTR void OPNAME ## h264_qpel ## N ## _v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    for(x=0; x<N; x++){\
        /* fetch the whole column (rows -2..N+2) before any store */ \
        int column[N+5];\
        for(k=0; k<N+5; k++)\
            column[k]= src[x + (k-2)*srcStride];\
        for(k=0; k<N; k++)\
            OP(dst[x + k*dstStride], H264_LOWPASS_TAP6(column[k], column[k+1], column[k+2], column[k+3], column[k+4], column[k+5]));\
    }\
}\
\
static ATTR void OPNAME ## h264_qpel ## N ## _hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y, k;\
    /* pass 1: unscaled horizontal sums for N+5 rows, starting two rows above src */ \
    src -= 2*srcStride;\
    for(y=0; y<N+5; y++){\
        for(x=0; x<N; x++)\
            tmp[y*tmpStride + x]= H264_LOWPASS_TAP6(src[x-2], src[x-1], src[x], src[x+1], src[x+2], src[x+3]);\
        src+=srcStride;\
    }\
    /* pass 2: vertical filter over the scratch rows; scratch row k+2 lines up with output row k */ \
    for(x=0; x<N; x++){\
        int column[N+5];\
        for(k=0; k<N+5; k++)\
            column[k]= tmp[k*tmpStride + x];\
        for(k=0; k<N; k++)\
            OP2(dst[x + k*dstStride], H264_LOWPASS_TAP6(column[k], column[k+1], column[k+2], column[k+3], column[k+4], column[k+5]));\
    }\
}

/*
 * H264_LOWPASS(OPNAME, OP, OP2) generates the h/v/hv low-pass functions for
 * block sizes 2, 4, 8 and 16. The size-2 functions are tagged av_unused.
 * The 16x16 versions are built from four 8x8 calls, in the order
 * top-left, top-right, bottom-left, bottom-right.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
H264_LOWPASS_N(OPNAME, OP, OP2, 2, av_unused)\
H264_LOWPASS_N(OPNAME, OP, OP2, 4, H264_LOWPASS_NO_ATTR)\
H264_LOWPASS_N(OPNAME, OP, OP2, 8, H264_LOWPASS_NO_ATTR)\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t * const dst_lo = dst + 8*dstStride;\
    uint8_t * const src_lo = src + 8*srcStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst     , src     , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst   +8, src   +8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_lo  , src_lo  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_lo+8, src_lo+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t * const dst_lo = dst + 8*dstStride;\
    uint8_t * const src_lo = src + 8*srcStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst     , src     , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst   +8, src   +8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_lo  , src_lo  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_lo+8, src_lo+8, dstStride, srcStride);\
}\
\
/* the same scratch area (tmp / tmp+8) is reused for the upper and lower halves */ \
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t * const dst_lo = dst + 8*dstStride;\
    uint8_t * const src_lo = src + 8*srcStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst     , tmp  , src     , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst   +8, tmp+8, src   +8, dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_lo  , tmp  , src_lo  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_lo+8, tmp+8, src_lo+8, dstStride, tmpStride, srcStride);\
}
2494
/*
 * H264_MC(OPNAME, SIZE) generates the sixteen entry points
 * OPNAME h264_qpelSIZE_mcXY_c(dst, src, stride) for a SIZExSIZE block.
 *
 * As the bodies show, X selects the horizontal and Y the vertical variant:
 * 0 uses the source samples as they are, 2 applies the low-pass filter in
 * that direction, and 1/3 average two neighbouring results.
 * Intermediate planes are always produced with the put_ filters; only the
 * final store goes through OPNAME. Vertical filters run on a packed copy of
 * the source (SIZE+5 rows of SIZE bytes) whose third row corresponds to src.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: no filtering */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: mean of src and the horizontally filtered plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontally filtered plane written straight to dst */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: as mc10, but averaged with the sample one column to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: mean of src and the vertically filtered plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, row0, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertically filtered plane written straight to dst */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, row0, stride, SIZE);\
}\
\
/* mc03: as mc01, but averaged with the sample one row below */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, row0+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: mean of the h-filtered plane at src and the v-filtered plane at src */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: as mc11, with the vertical filter taken one column to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: as mc11, with the horizontal filter taken one row below */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: horizontal filter one row below, vertical filter one column to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-pass (horizontal then vertical) filter written straight to dst */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: mean of the h-filtered plane and the two-pass plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: as mc21, with the horizontal filter taken one row below */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: mean of the v-filtered plane and the two-pass plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: as mc12, with the vertical filter taken one column to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const row0= rows + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, row0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}
2631
2632 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2633 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2634 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2635 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2636 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2637
2638 H264_LOWPASS(put_       , op_put, op2_put)
2639 H264_LOWPASS(avg_       , op_avg, op2_avg)
2640 H264_MC(put_, 2)
2641 H264_MC(put_, 4)
2642 H264_MC(put_, 8)
2643 H264_MC(put_, 16)
2644 H264_MC(avg_, 4)
2645 H264_MC(avg_, 8)
2646 H264_MC(avg_, 16)
2647
2648 #undef op_avg
2649 #undef op_put
2650 #undef op2_avg
2651 #undef op2_put
2652 #endif
2653
/*
 * Per-sample operators used by H264_WEIGHT. Both rely on names that are
 * local to the generated functions (block/dst/src, the weights, offset and
 * log2_denom) and clip the shifted result to 0..255 with av_clip_uint8().
 *
 * op_scale1: in-place   block[x] = clip((block[x]*weight + offset) >> log2_denom)
 * op_scale2: two-source dst[x]   = clip((src[x]*weights + dst[x]*weightd + offset)
 *                                        >> (log2_denom+1))
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/*
 * H264_WEIGHT(W,H) generates, for a WxH block:
 *
 *   weight_h264_pixelsWxH_c(block, stride, log2_denom, weight, offset)
 *       Rescales `block` in place. `offset` is first scaled up by
 *       2^log2_denom and, when log2_denom is non-zero, the rounding term
 *       2^(log2_denom-1) is added.
 *
 *   biweight_h264_pixelsWxH_c(dst, src, stride, log2_denom, weightd, weights, offset)
 *       Blends `src` into `dst`; both use the same stride. The combined
 *       offset is ((offset + 1) | 1) scaled up by 2^log2_denom.
 *
 * `offset` is a signed int. The scaling is written as a multiplication by
 * (1 << log2_denom) rather than a left shift of `offset` itself, because
 * left-shifting a negative value is undefined behaviour in C.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int col, row; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(row=0; row<H; row++, block += stride){ \
        for(col=0; col<W; col++) \
            op_scale1(col); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int col, row; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(row=0; row<H; row++, dst += stride, src += stride){ \
        for(col=0; col<W; col++) \
            op_scale2(col); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2723
2724 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2725     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2726     int i;
2727
2728     for(i=0; i<h; i++){
2729         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2730         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2731         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2732         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2733         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2734         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2735         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2736         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2737         dst+=dstStride;
2738         src+=srcStride;
2739     }
2740 }
2741
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * Position-00 entry points, which forward to the plain pixel routines
 * OP_pixelsSIZE_c with SIZE rows.
 * The generated symbols are:
 *   ff_put_cavs_qpel8_mc00_c   ff_avg_cavs_qpel8_mc00_c
 *   ff_put_cavs_qpel16_mc00_c  ff_avg_cavs_qpel16_mc00_c
 */
#define CAVS_QPEL_MC00(OP, SIZE) \
void ff_ ## OP ## _cavs_qpel ## SIZE ## _mc00_c(uint8_t *dst, uint8_t *src, int stride) { \
    OP ## _pixels ## SIZE ## _c(dst, src, stride, SIZE); \
}

CAVS_QPEL_MC00(put,  8)
CAVS_QPEL_MC00(avg,  8)
CAVS_QPEL_MC00(put, 16)
CAVS_QPEL_MC00(avg, 16)

#undef CAVS_QPEL_MC00
#endif /* CONFIG_CAVS_DECODER */
2759
2760 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2761
2762 #if CONFIG_VC1_DECODER
2763 /* VC-1 specific */
2764 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2765
2766 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2767     put_pixels8_c(dst, src, stride, 8);
2768 }
2769 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2770     avg_pixels8_c(dst, src, stride, 8);
2771 }
2772 #endif /* CONFIG_VC1_DECODER */
2773
2774 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2775
2776 /* H264 specific */
2777 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2778
2779 #if CONFIG_RV30_DECODER
2780 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2781 #endif /* CONFIG_RV30_DECODER */
2782
#if CONFIG_RV40_DECODER
/*
 * Position-33 entry points, which forward to the OP_pixelsSIZE_xy2_c
 * routines with SIZE rows.
 * The generated (file-local) symbols are:
 *   put_rv40_qpel16_mc33_c  avg_rv40_qpel16_mc33_c
 *   put_rv40_qpel8_mc33_c   avg_rv40_qpel8_mc33_c
 */
#define RV40_QPEL_MC33(OP, SIZE) \
static void OP ## _rv40_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){ \
    OP ## _pixels ## SIZE ## _xy2_c(dst, src, stride, SIZE); \
}

RV40_QPEL_MC33(put, 16)
RV40_QPEL_MC33(avg, 16)
RV40_QPEL_MC33(put,  8)
RV40_QPEL_MC33(avg,  8)

#undef RV40_QPEL_MC33

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2799
2800 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2801     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2802     int i;
2803
2804     for(i=0; i<w; i++){
2805         const int src_1= src[ -srcStride];
2806         const int src0 = src[0          ];
2807         const int src1 = src[  srcStride];
2808         const int src2 = src[2*srcStride];
2809         const int src3 = src[3*srcStride];
2810         const int src4 = src[4*srcStride];
2811         const int src5 = src[5*srcStride];
2812         const int src6 = src[6*srcStride];
2813         const int src7 = src[7*srcStride];
2814         const int src8 = src[8*srcStride];
2815         const int src9 = src[9*srcStride];
2816         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2817         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2818         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2819         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2820         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2821         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2822         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2823         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2824         src++;
2825         dst++;
2826     }
2827 }
2828
/* mc00: 8x8 copy without filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/*
 * Shared body of mc10/mc30: filter the 8x8 block at src horizontally into a
 * packed buffer, then combine that buffer with the block at ref (read with
 * the frame stride) via put_pixels8_l2().
 */
static void put_mspel8_h_blend(uint8_t *dst, uint8_t *src, uint8_t *ref, int stride){
    uint8_t filtered[64];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, ref, filtered, stride, stride, 8, 8);
}

/* mc10: horizontally filtered block combined with the block at src. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    put_mspel8_h_blend(dst, src, src, stride);
}

/* mc20: horizontally filtered block, written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* mc30: horizontally filtered block combined with the block at src+1. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    put_mspel8_h_blend(dst, src, src+1, stride);
}

/* mc02: vertically filtered block, written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2852
/*
 * Shared body of mc12/mc32.
 *
 * hrows receives 11 horizontally filtered rows of 8 samples, starting one
 * row above src; filtering it vertically from its second row gives the
 * two-pass block. That block is combined, via put_pixels8_l2(), with the
 * vertically filtered block taken at vsrc.
 */
static void put_mspel8_hv_blend(uint8_t *dst, uint8_t *src, uint8_t *vsrc, int stride){
    uint8_t hrows[88];
    uint8_t vfilt[64];
    uint8_t hvfilt[64];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, vsrc, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}

/* mc12: vertical filter taken at src, combined with the two-pass block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    put_mspel8_hv_blend(dst, src, src, stride);
}

/* mc32: vertical filter taken at src+1, combined with the two-pass block. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    put_mspel8_hv_blend(dst, src, src+1, stride);
}

/* mc22: two-pass (horizontal, then vertical) block written straight to dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[88];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2876
2877 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2878     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2879     int x;
2880     const int strength= ff_h263_loop_filter_strength[qscale];
2881
2882     for(x=0; x<8; x++){
2883         int d1, d2, ad1;
2884         int p0= src[x-2*stride];
2885         int p1= src[x-1*stride];
2886         int p2= src[x+0*stride];
2887         int p3= src[x+1*stride];
2888         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2889
2890         if     (d<-2*strength) d1= 0;
2891         else if(d<-  strength) d1=-2*strength - d;
2892         else if(d<   strength) d1= d;
2893         else if(d< 2*strength) d1= 2*strength - d;
2894         else                   d1= 0;
2895
2896         p1 += d1;
2897         p2 -= d1;
2898         if(p1&256) p1= ~(p1>>31);
2899         if(p2&256) p2= ~(p2>>31);
2900
2901         src[x-1*stride] = p1;
2902         src[x+0*stride] = p2;
2903
2904         ad1= FFABS(d1)>>1;
2905
2906         d2= av_clip((p0-p3)/4, -ad1, ad1);
2907
2908         src[x-2*stride] = p0 - d2;
2909         src[x+  stride] = p3 + d2;
2910     }
2911     }
2912 }
2913
2914 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2915     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2916     int y;
2917     const int strength= ff_h263_loop_filter_strength[qscale];
2918
2919     for(y=0; y<8; y++){
2920         int d1, d2, ad1;
2921         int p0= src[y*stride-2];
2922         int p1= src[y*stride-1];
2923         int p2= src[y*stride+0];
2924         int p3= src[y*stride+1];
2925         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2926
2927         if     (d<-2*strength) d1= 0;
2928         else if(d<-  strength) d1=-2*strength - d;
2929         else if(d<   strength) d1= d;
2930         else if(d< 2*strength) d1= 2*strength - d;
2931         else                   d1= 0;
2932
2933         p1 += d1;
2934         p2 -= d1;
2935         if(p1&256) p1= ~(p1>>31);
2936         if(p2&256) p2= ~(p2>>31);
2937
2938         src[y*stride-1] = p1;
2939         src[y*stride+0] = p2;
2940
2941         ad1= FFABS(d1)>>1;
2942
2943         d2= av_clip((p0-p3)/4, -ad1, ad1);
2944
2945         src[y*stride-2] = p0 - d2;
2946         src[y*stride+1] = p3 + d2;
2947     }
2948     }
2949 }
2950
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * The vertical pass writes into a temporary buffer: rows 0 and 7 are not
 * filtered vertically and are stored as 4*sample, rows 1..6 store
 * above + 2*centre + below. The horizontal pass then writes back to src:
 * columns 0 and 7 only undo the vertical gain ((v + 2) >> 2), columns 1..6
 * apply the (1, 2, 1) kernel and normalise with (sum + 8) >> 4.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* vertical pass: only reads src */
    for(row=0; row<8; row++){
        const uint8_t *line = src + row*stride;
        int *out = vert + row*8;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                out[col] = 4*line[col];
            else
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass: only reads the temporary buffer */
    for(row=0; row<8; row++){
        uint8_t *line = src + row*stride;
        const int *in = vert + row*8;

        line[0] = (in[0] + 2)>>2;
        line[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2977
2978 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2979 {
2980     int i, d;
2981     for( i = 0; i < 4; i++ ) {
2982         if( tc0[i] < 0 ) {
2983             pix += 4*ystride;
2984             continue;
2985         }
2986         for( d = 0; d < 4; d++ ) {
2987             const int p0 = pix[-1*xstride];
2988             const int p1 = pix[-2*xstride];
2989             const int p2 = pix[-3*xstride];
2990             const int q0 = pix[0];
2991             const int q1 = pix[1*xstride];
2992             const int q2 = pix[2*xstride];
2993
2994             if( FFABS( p0 - q0 ) < alpha &&
2995                 FFABS( p1 - p0 ) < beta &&
2996                 FFABS( q1 - q0 ) < beta ) {
2997
2998                 int tc = tc0[i];
2999                 int i_delta;
3000
3001                 if( FFABS( p2 - p0 ) < beta ) {
3002                     if(tc0[i])
3003                     pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3004                     tc++;
3005                 }
3006                 if( FFABS( q2 - q0 ) < beta ) {
3007                     if(tc0[i])
3008                     pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3009                     tc++;
3010                 }
3011
3012                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3013                 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
3014                 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
3015             }
3016             pix += ystride;
3017         }
3018     }
3019 }
/* Luma edge filter whose cross-edge samples are stride bytes apart and whose
   successive positions are adjacent bytes. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
/* Luma edge filter whose cross-edge samples are adjacent bytes and whose
   successive positions are stride bytes apart. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
3028
3029 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3030 {
3031     int d;
3032     for( d = 0; d < 16; d++ ) {
3033         const int p2 = pix[-3*xstride];
3034         const int p1 = pix[-2*xstride];
3035         const int p0 = pix[-1*xstride];
3036
3037         const int q0 = pix[ 0*xstride];
3038         const int q1 = pix[ 1*xstride];
3039         const int q2 = pix[ 2*xstride];
3040
3041         if( FFABS( p0 - q0 ) < alpha &&
3042             FFABS( p1 - p0 ) < beta &&
3043             FFABS( q1 - q0 ) < beta ) {
3044
3045             if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3046                 if( FFABS( p2 - p0 ) < beta)
3047                 {
3048                     const int p3 = pix[-4*xstride];
3049                     /* p0', p1', p2' */
3050                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3051                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3052                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3053                 } else {
3054                     /* p0' */
3055                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3056                 }
3057                 if( FFABS( q2 - q0 ) < beta)
3058                 {
3059                     const int q3 = pix[3*xstride];
3060                     /* q0', q1', q2' */
3061                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3062                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3063                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3064                 } else {
3065                     /* q0' */
3066                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3067                 }
3068             }else{
3069                 /* p0', q0' */
3070                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3071                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3072             }
3073         }
3074         pix += ystride;
3075     }
3076 }
/* Luma intra edge filter whose cross-edge samples are stride bytes apart and
   whose successive positions are adjacent bytes. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_intra_c(pix, across, along, alpha, beta);
}
/* Luma intra edge filter whose cross-edge samples are adjacent bytes and
   whose successive positions are stride bytes apart. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_intra_c(pix, across, along, alpha, beta);
}
3085
3086 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3087 {
3088     int i, d;
3089     for( i = 0; i < 4; i++ ) {
3090         const int tc = tc0[i];
3091         if( tc <= 0 ) {
3092             pix += 2*ystride;
3093             continue;
3094         }
3095         for( d = 0; d < 2; d++ ) {
3096             const int p0 = pix[-1*xstride];
3097             const int p1 = pix[-2*xstride];
3098             const int q0 = pix[0];
3099             const int q1 = pix[1*xstride];
3100
3101             if( FFABS( p0 - q0 ) < alpha &&
3102                 FFABS( p1 - p0 ) < beta &&
3103                 FFABS( q1 - q0 ) < beta ) {
3104
3105                 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3106
3107                 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
3108                 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
3109             }
3110             pix += ystride;
3111         }
3112     }
3113 }
/* Chroma edge filter whose cross-edge samples are stride bytes apart and
   whose successive positions are adjacent bytes. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
/* Chroma edge filter whose cross-edge samples are adjacent bytes and whose
   successive positions are stride bytes apart. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
3122
3123 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3124 {
3125     int d;
3126     for( d = 0; d < 8; d++ ) {
3127         const int p0 = pix[-1*xstride];
3128         const int p1 = pix[-2*xstride];
3129         const int q0 = pix[0];
3130         const int q1 = pix[1*xstride];
3131
3132         if( FFABS( p0 - q0 ) < alpha &&
3133             FFABS( p1 - p0 ) < beta &&
3134             FFABS( q1 - q0 ) < beta ) {
3135
3136             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
3137             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
3138         }
3139         pix += ystride;
3140     }
3141 }
/* Chroma intra edge filter whose cross-edge samples are stride bytes apart
   and whose successive positions are adjacent bytes. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
/* Chroma intra edge filter whose cross-edge samples are adjacent bytes and
   whose successive positions are stride bytes apart. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
3150
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over all 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3178
/**
 * Sum of absolute differences, 16 pixels wide, against pix2 averaged with
 * its right-hand neighbour via avg2().  Reads 17 bytes per row of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3206
/**
 * Sum of absolute differences, 16 pixels wide, against pix2 averaged with
 * the pixel one row below via avg2().  Reads h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3236
/**
 * Sum of absolute differences, 16 pixels wide, against the avg4() of each
 * pix2 pixel with its right, lower and lower-right neighbours.
 * Reads 17 bytes per row over h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3266
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over all 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3286
/**
 * Sum of absolute differences, 8 pixels wide, against pix2 averaged with
 * its right-hand neighbour via avg2().  Reads 9 bytes per row of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3306
/**
 * Sum of absolute differences, 8 pixels wide, against pix2 averaged with
 * the pixel one row below via avg2().  Reads h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3328
/**
 * Sum of absolute differences, 8 pixels wide, against the avg4() of each
 * pix2 pixel with its right, lower and lower-right neighbours.
 * Reads 9 bytes per row over h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, same for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3350
3351 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3352     MpegEncContext *c = v;
3353     int score1=0;
3354     int score2=0;
3355     int x,y;
3356
3357     for(y=0; y<h; y++){
3358         for(x=0; x<16; x++){
3359             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3360         }
3361         if(y+1<h){
3362             for(x=0; x<15; x++){
3363                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3364                              - s1[x+1] + s1[x+1+stride])
3365                         -FFABS(  s2[x  ] - s2[x  +stride]
3366                              - s2[x+1] + s2[x+1+stride]);
3367             }
3368         }
3369         s1+= stride;
3370         s2+= stride;
3371     }
3372
3373     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3374     else  return score1 + FFABS(score2)*8;
3375 }
3376
3377 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3378     MpegEncContext *c = v;
3379     int score1=0;
3380     int score2=0;
3381     int x,y;
3382
3383     for(y=0; y<h; y++){
3384         for(x=0; x<8; x++){
3385             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3386         }
3387         if(y+1<h){
3388             for(x=0; x<7; x++){
3389                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3390                              - s1[x+1] + s1[x+1+stride])
3391                         -FFABS(  s2[x  ] - s2[x  +stride]
3392                              - s2[x+1] + s2[x+1+stride]);
3393             }
3394         }
3395         s1+= stride;
3396         s2+= stride;
3397     }
3398
3399     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3400     else  return score1 + FFABS(score2)*8;
3401 }
3402
3403 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3404     int i;
3405     unsigned int sum=0;
3406
3407     for(i=0; i<8*8; i++){
3408         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3409         int w= weight[i];
3410         b>>= RECON_SHIFT;
3411         assert(-512<b && b<512);
3412
3413         sum += (w*b)*(w*b)>>4;
3414     }
3415     return sum>>2;
3416 }
3417
3418 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3419     int i;
3420
3421     for(i=0; i<8*8; i++){
3422         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3423     }
3424 }
3425
3426 /**
3427  * permutes an 8x8 block.
3428  * @param block the block which will be permuted according to the given permutation vector
3429  * @param permutation the permutation vector
3430  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3431  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3432  *                  (inverse) permutated to scantable order!
3433  */
3434 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3435 {
3436     int i;
3437     DCTELEM temp[64];
3438
3439     if(last<=0) return;
3440     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3441
3442     for(i=0; i<=last; i++){
3443         const int j= scantable[i];
3444         temp[j]= block[j];
3445         block[j]=0;
3446     }
3447
3448     for(i=0; i<=last; i++){
3449         const int j= scantable[i];
3450         const int perm_j= permutation[j];
3451         block[perm_j]= temp[j];
3452     }
3453 }
3454
/* Comparison function that ignores all of its arguments and scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3458
3459 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3460     int i;
3461
3462     memset(cmp, 0, sizeof(void*)*6);
3463
3464     for(i=0; i<6; i++){
3465         switch(type&0xFF){
3466         case FF_CMP_SAD:
3467             cmp[i]= c->sad[i];
3468             break;
3469         case FF_CMP_SATD:
3470             cmp[i]= c->hadamard8_diff[i];
3471             break;
3472         case FF_CMP_SSE:
3473             cmp[i]= c->sse[i];
3474             break;
3475         case FF_CMP_DCT:
3476             cmp[i]= c->dct_sad[i];
3477             break;
3478         case FF_CMP_DCT264:
3479             cmp[i]= c->dct264_sad[i];
3480             break;
3481         case FF_CMP_DCTMAX:
3482             cmp[i]= c->dct_max[i];
3483             break;
3484         case FF_CMP_PSNR:
3485             cmp[i]= c->quant_psnr[i];
3486             break;
3487         case FF_CMP_BIT:
3488             cmp[i]= c->bit[i];
3489             break;
3490         case FF_CMP_RD:
3491             cmp[i]= c->rd[i];
3492             break;
3493         case FF_CMP_VSAD:
3494             cmp[i]= c->vsad[i];
3495             break;
3496         case FF_CMP_VSSE:
3497             cmp[i]= c->vsse[i];
3498             break;
3499         case FF_CMP_ZERO:
3500             cmp[i]= zero_cmp;
3501             break;
3502         case FF_CMP_NSSE:
3503             cmp[i]= c->nsse[i];
3504             break;
3505 #if CONFIG_SNOW_ENCODER
3506         case FF_CMP_W53:
3507             cmp[i]= c->w53[i];
3508             break;
3509         case FF_CMP_W97:
3510             cmp[i]= c->w97[i];
3511             break;
3512 #endif
3513         default:
3514             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3515         }
3516     }
3517 }
3518
3519 static void clear_block_c(DCTELEM *block)
3520 {
3521     memset(block, 0, sizeof(DCTELEM)*64);
3522 }
3523
3524 /**
3525  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3526  */
3527 static void clear_blocks_c(DCTELEM *blocks)
3528 {
3529     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3530 }
3531
3532 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3533     long i;
3534     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3535         long a = *(long*)(src+i);
3536         long b = *(long*)(dst+i);
3537         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3538     }
3539     for(; i<w; i++)
3540         dst[i+0] += src[i+0];
3541 }
3542
3543 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3544     long i;
3545     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3546         long a = *(long*)(src1+i);
3547         long b = *(long*)(src2+i);
3548         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3549     }
3550     for(; i<w; i++)
3551         dst[i] = src1[i]+src2[i];
3552 }
3553
3554 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3555     long i;
3556 #if !HAVE_FAST_UNALIGNED
3557     if((long)src2 & (sizeof(long)-1)){
3558         for(i=0; i+7<w; i+=8){
3559             dst[i+0] = src1[i+0]-src2[i+0];
3560             dst[i+1] = src1[i+1]-src2[i+1];
3561             dst[i+2] = src1[i+2]-src2[i+2];
3562             dst[i+3] = src1[i+3]-src2[i+3];
3563             dst[i+4] = src1[i+4]-src2[i+4];
3564             dst[i+5] = src1[i+5]-src2[i+5];
3565             dst[i+6] = src1[i+6]-src2[i+6];
3566             dst[i+7] = src1[i+7]-src2[i+7];
3567         }
3568     }else
3569 #endif
3570     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3571         long a = *(long*)(src1+i);
3572         long b = *(long*)(src2+i);
3573         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3574     }
3575     for(; i<w; i++)
3576         dst[i+0] = src1[i+0]-src2[i+0];
3577 }
3578
/**
 * Reconstruct w bytes from residuals: each output is the residual diff[i]
 * plus mid_pred() of the previous output, src1[i] and their gradient
 * (previous + src1[i] - previous src1), everything kept to 8 bits.
 * @param left     in: value preceding dst[0]; out: last value written
 * @param left_top in: value preceding src1[0]; out: last src1 value read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top      = src1[x];
        const int gradient = (cur + top - top_left)&0xFF;

        cur      = mid_pred(cur, top, gradient) + diff[x];
        top_left = top;
        dst[x]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
3595
/**
 * Compute w residuals: each output is src2[i] minus mid_pred() of the
 * previous src2 value, src1[i] and their gradient
 * (previous + src1[i] - previous src1), everything kept to 8 bits.
 * @param left     in: value preceding src2[0]; out: last src2 value read
 * @param left_top in: value preceding src1[0]; out: last src1 value read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top      = src1[x];
        const int gradient = (cur + top - top_left)&0xFF;
        const int pred     = mid_pred(cur, top, gradient);

        top_left = top;
        cur      = src2[x];
        dst[x]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}
3613
/**
 * Running sum: dst[i] receives the low 8 bits of acc after src[0..i] have
 * been added to it.
 * @param acc starting value of the accumulator
 * @return the accumulator after all w bytes, not reduced to 8 bits
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for(x=0; x<w; x++){
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3632
/* byte offset of each channel inside one 4-byte pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over w 4-byte pixels: every output byte is the
 * low 8 bits of its channel's accumulator after the matching source byte
 * has been added.
 * @param red,green,blue,alpha in: starting accumulators; out: accumulators
 *                             after the last pixel, not reduced to 8 bits
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int r= *red, g= *green, b= *blue, a= *alpha;
    int px;

    for(px=0; px<w; px++){
        const uint8_t * const in = src + 4*px;
        uint8_t * const       out= dst + 4*px;

        /* read the whole pixel before writing it */
        b+= in[B];
        g+= in[G];
        r+= in[R];
        a+= in[A];

        out[B]= b;
        out[G]= g;
        out[R]= r;
        out[A]= a;
    }

    *red  = r;
    *green= g;
    *blue = b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
3673
/*
 * Butterfly helpers.  The two statement-like macros are wrapped in
 * do{}while(0) so each use, followed by its semicolon, is exactly one
 * statement and stays correct under an unbraced if/else or loop.
 */

/* o1 = i1+i2, o2 = i1-i2; the inputs are evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
}while(0)

/* in-place variant: x becomes x+y and y becomes x-y (old values);
   the temporaries carry unlikely names so arguments called a or b
   are not captured */
#define BUTTERFLY1(x,y) \
do{\
    const int butterfly_in0_= (x);\
    const int butterfly_in1_= (y);\
    (x)= butterfly_in0_+butterfly_in1_;\
    (y)= butterfly_in0_-butterfly_in1_;\
}while(0)

/* |x+y| + |x-y|, i.e. the absolute sum of a final butterfly's outputs */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3688
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard) transform of
 * the difference src - dst.
 * @param s      unused
 * @param stride distance in bytes between rows, same for both blocks
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int k;

    assert(h==8);

    /* three butterfly stages along every row of the difference */
    for(k=0; k<8; k++){
        int * const t= temp + 8*k;
        const uint8_t * const ps= src + stride*k;
        const uint8_t * const pd= dst + stride*k;

        BUTTERFLY2(t[0], t[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(t[2], t[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(t[4], t[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(t[6], t[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* two stages down every column; the third is folded into the
       absolute sum by BUTTERFLYA */
    for(k=0; k<8; k++){
        int * const col= temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3740
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard) transform of
 * src itself, with the contribution of the mean term subtracted again.
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int k;

    assert(h==8);

    /* three butterfly stages along every row */
    for(k=0; k<8; k++){
        int * const t= temp + 8*k;
        const uint8_t * const ps= src + stride*k;

        BUTTERFLY2(t[0], t[1], ps[0],ps[1]);
        BUTTERFLY2(t[2], t[3], ps[2],ps[3]);
        BUTTERFLY2(t[4], t[5], ps[4],ps[5]);
        BUTTERFLY2(t[6], t[7], ps[6],ps[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* two stages down every column; the third is folded into the
       absolute sum by BUTTERFLYA */
    for(k=0; k<8; k++){
        int * const col= temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3788
3789 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3790     MpegEncContext * const s= (MpegEncContext *)c;
3791     DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3792     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3793
3794     assert(h==8);
3795
3796     s->dsp.diff_pixels(temp, src1, src2, stride);
3797     s->dsp.fdct(temp);
3798     return s->dsp.sum_abs_dctelem(temp);
3799 }
3800
#if CONFIG_GPL
/*
 * One 8-point integer transform.  The caller supplies SRC(x) to read input
 * x and DST(x,v) to consume output x.  All inputs are read into constants
 * before the first DST(), so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of the absolute values of the 2-D DCT8_1D transform of the pixel
 * difference src1 - src2 of an 8x8 block.
 * @param c MpegEncContext whose dsp.diff_pixels is used
 * @param h unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc= (MpegEncContext *)c;
    DCTELEM coef[8][8];
    int sum=0;
    int n;

    enc->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) coef[n][x]
#define DST(x,v) coef[n][x]= v
    for( n = 0; n < 8; n++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform every column, only accumulating magnitudes */
#define SRC(x) coef[x][n]
#define DST(x,v) sum += FFABS(v)
    for( n = 0; n < 8; n++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3853
3854 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3855     MpegEncContext * const s= (MpegEncContext *)c;
3856     DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3857     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3858     int sum=0, i;
3859
3860     assert(h==8);
3861
3862     s->dsp.diff_pixels(temp, src1, src2, stride);
3863     s->dsp.fdct(temp);
3864
3865     for(i=0; i<64; i++)
3866         sum= FFMAX(sum, FFABS(temp[i]));
3867
3868     return sum;
3869 }
3870
3871 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3872     MpegEncContext * const s= (MpegEncContext *)c;
3873     DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64*2/8];
3874     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3875     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3876     int sum=0, i;
3877
3878     assert(h==8);
3879     s->mb_intra=0;
3880
3881     s->dsp.diff_pixels(temp, src1, src2, stride);
3882
3883     memcpy(bak, temp, 64*sizeof(DCTELEM));
3884
3885     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3886     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3887     ff_simple_idct(temp); //FIXME
3888
3889     for(i=0; i<64; i++)
3890         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3891
3892     return sum;
3893 }
3894
3895 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3896     MpegEncContext * const s= (MpegEncContext *)c;
3897     const uint8_t *scantable= s->intra_scantable.permutated;
3898     DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3899     DECLARE_ALIGNED_16(uint64_t, aligned_src1)[8];
3900     DECLARE_ALIGNED_16(uint64_t, aligned_src2)[8];
3901     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3902     uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
3903     uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
3904     int i, last, run, bits, level, distortion, start_i;
3905     const int esc_length= s->ac_esc_length;
3906     uint8_t * length;
3907     uint8_t * last_length;
3908
3909     assert(h==8);
3910
3911     copy_block8(lsrc1, src1, 8, stride, 8);
3912     copy_block8(lsrc2, src2, 8, stride, 8);
3913
3914     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3915
3916     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3917
3918     bits=0;
3919
3920     if (s->mb_intra) {
3921         start_i = 1;
3922         length     = s->intra_ac_vlc_length;
3923         last_length= s->intra_ac_vlc_last_length;
3924         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3925     } else {
3926         start_i = 0;
3927         length     = s->inter_ac_vlc_length;
3928         last_length= s->inter_ac_vlc_last_length;
3929     }
3930
3931     if(last>=start_i){
3932         run=0;
3933         for(i=start_i; i<last; i++){
3934             int j= scantable[i];
3935             level= temp[j];
3936
3937             if(level){
3938                 level+=64;
3939                 if((level&(~127)) == 0){
3940                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3941                 }else
3942                     bits+= esc_length;
3943                 run=0;
3944             }else
3945                 run++;
3946         }
3947         i= scantable[last];
3948
3949         level= temp[i] + 64;
3950
3951         assert(level - 64);
3952
3953         if((level&(~127)) == 0){
3954             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3955         }else
3956             bits+= esc_length;
3957
3958     }
3959
3960     if(last>=0){
3961         if(s->mb_intra)
3962             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3963         else
3964             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3965     }
3966
3967     s->dsp.idct_add(lsrc2, 8, temp);
3968
3969     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3970
3971     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3972 }
3973
3974 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3975     MpegEncContext * const s= (MpegEncContext *)c;
3976     const uint8_t *scantable= s->intra_scantable.permutated;
3977     DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3978     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3979     int i, last, run, bits, level, start_i;
3980     const int esc_length= s->ac_esc_length;
3981     uint8_t * length;
3982     uint8_t * last_length;
3983
3984     assert(h==8);
3985
3986     s->dsp.diff_pixels(temp, src1, src2, stride);
3987
3988     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3989
3990     bits=0;
3991
3992     if (s->mb_intra) {
3993         start_i = 1;
3994         length     = s->intra_ac_vlc_length;
3995         last_length= s->intra_ac_vlc_last_length;
3996         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3997     } else {
3998         start_i = 0;
3999         length     = s->inter_ac_vlc_length;
4000         last_length= s->inter_ac_vlc_last_length;
4001     }
4002
4003     if(last>=start_i){
4004         run=0;
4005         for(i=start_i; i<last; i++){
4006             int j= scantable[i];
4007             level= temp[j];
4008
4009             if(level){
4010                 level+=64;
4011                 if((level&(~127)) == 0){
4012                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
4013                 }else
4014                     bits+= esc_length;
4015                 run=0;
4016             }else
4017                 run++;
4018         }
4019         i= scantable[last];
4020
4021         level= temp[i] + 64;
4022
4023         assert(level - 64);
4024
4025         if((level&(~127)) == 0){
4026             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4027         }else
4028             bits+= esc_length;
4029     }
4030
4031     return bits;
4032 }
4033
/*
 * VSAD_INTRA(size) defines vsad_intra<size>_c(): the sum of absolute
 * differences between each pixel and the pixel one line (stride bytes)
 * below it, over the first <size> columns of lines 0..h-2 of s.
 * The c and dummy arguments are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *upper = s;                                                                               \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, upper += stride)                                                          \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(upper[col] - upper[col + stride]);                                               \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
4051
/**
 * Vertical SAD of the difference signal s1 - s2, 16 pixels wide: for each of
 * the lines 0..h-2, sums |(s1 - s2) on this line - (s1 - s2) on the next line|.
 * The c argument is not used.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            total += FFABS(here - below);
        }
    }

    return total;
}
4066
#define SQ(a) ((a)*(a))
/*
 * VSSE_INTRA(size) defines vsse_intra<size>_c(): the sum of squared
 * differences between each pixel and the pixel one line (stride bytes)
 * below it, over the first <size> columns of lines 0..h-2 of s.
 * The c and dummy arguments are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *upper = s;                                                                               \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, upper += stride)                                                          \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(upper[col] - upper[col + stride]);                                                  \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
4085
/**
 * Vertical SSE of the difference signal s1 - s2, 16 pixels wide: for each of
 * the lines 0..h-2, sums the square of
 * (s1 - s2) on this line minus (s1 - s2) on the next line.
 * The c argument is not used.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d*d;
        }
    }

    return total;
}
4100
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;

    while (size-- > 0) {
        const int d = *pix1++ - *pix2++;
        total += d*d;
    }
    return total;
}
4109
/* Instantiate WRAPPER8_16_SQ once per 8x8 comparison function above.
 * NOTE(review): the macro is defined outside this excerpt; judging by the
 * argument names it presumably emits the second-named *16_c function on top
 * of the first-named 8x8 one -- confirm against the macro definition.
 * The dct264 pair only exists when CONFIG_GPL is set, matching the guard
 * around dct264_sad8x8_c itself. */
4110 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4111 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4112 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4113 #if CONFIG_GPL
4114 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4115 #endif
4116 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4117 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4118 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4119 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4120
/**
 * Multiply dst by src element-wise, in place: dst[k] *= src[k] for k < len.
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int remaining;

    for (remaining = len; remaining > 0; remaining--, dst++, src++)
        *dst *= *src;
}
4126
/**
 * Multiply src0 by src1 read backwards:
 * dst[k] = src0[k] * src1[len-1-k] for k < len.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
4133
/**
 * Element-wise multiply-add: dst[k] = src0[k] * src1[k] + src2[k] for k < len.
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int remaining;

    for (remaining = len; remaining > 0; remaining--, dst++, src0++, src1++, src2++)
        *dst = *src0 * *src1 + *src2;
}
4139
/**
 * Windowed overlap of two len-sample inputs into 2*len outputs.
 * With lo running up from 0 and hi running down from 2*len-1:
 *   dst[lo] = src0[lo]*win[hi] - src1[hi-len]*win[lo] + add_bias
 *   dst[hi] = src0[lo]*win[lo] + src1[hi-len]*win[hi] + add_bias
 * so dst and win hold 2*len elements, src0 and src1 hold len elements.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int lo, hi;

    for (lo = 0, hi = 2*len - 1; lo < len; lo++, hi--) {
        /* load everything before storing, as the original did */
        const float s0  = src0[lo];
        const float s1  = src1[hi - len];
        const float wlo = win[lo];
        const float whi = win[hi];
        dst[lo] = s0*whi - s1*wlo + add_bias;
        dst[hi] = s0*wlo + s1*whi + add_bias;
    }
}
4154
/**
 * Scale a vector: dst[k] = src[k] * mul for k < len.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining--, dst++, src++)
        *dst = *src * mul;
}
4162
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 * Each entry of sv points at two floats; entry n supplies the factors for
 * output elements 2n and 2n+1:
 *   dst[2n+j] = src[2n+j] * sv[n][j] * mul
 * len is consumed two elements at a time.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos = 0;

    while (pos < len) {
        const float *vec = *sv++;
        dst[pos  ] = src[pos  ] * vec[0] * mul;
        dst[pos+1] = src[pos+1] * vec[1] * mul;
        pos += 2;
    }
}
4172
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 * Each entry of sv points at four floats; entry n supplies the factors for
 * output elements 4n..4n+3:
 *   dst[4n+j] = src[4n+j] * sv[n][j] * mul
 * len is consumed four elements at a time.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos = 0;

    while (pos < len) {
        const float *vec = *sv++;
        dst[pos  ] = src[pos  ] * vec[0] * mul;
        dst[pos+1] = src[pos+1] * vec[1] * mul;
        dst[pos+2] = src[pos+2] * vec[2] * mul;
        dst[pos+3] = src[pos+3] * vec[3] * mul;
        pos += 4;
    }
}
4184
/**
 * Expand a sequence of 2-element vectors scaled by mul:
 *   dst[2n+j] = sv[n][j] * mul
 * len is consumed two elements at a time.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos = 0;

    while (pos < len) {
        const float *vec = *sv++;
        dst[pos  ] = vec[0] * mul;
        dst[pos+1] = vec[1] * mul;
        pos += 2;
    }
}
4194
/**
 * Expand a sequence of 4-element vectors scaled by mul:
 *   dst[4n+j] = sv[n][j] * mul
 * len is consumed four elements at a time.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos = 0;

    while (pos < len) {
        const float *vec = *sv++;
        dst[pos  ] = vec[0] * mul;
        dst[pos+1] = vec[1] * mul;
        dst[pos+2] = vec[2] * mul;
        dst[pos+3] = vec[3] * mul;
        pos += 4;
    }
}
4206
/**
 * In-place butterfly on two vectors: for each k < len,
 * v1[k] becomes v1[k] + v2[k] and v2[k] becomes the old v1[k] - v2[k].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];
        v1[k] = a + b;
        v2[k] = a - b;
    }
}
4217
/**
 * Dot product of two float vectors, accumulated in a float.
 * @return sum of v1[k] * v2[k] for k < len (0 when len <= 0)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int remaining;

    for (remaining = len; remaining > 0; remaining--, v1++, v2++)
        acc += *v1 * *v2;

    return acc;
}
4228
/**
 * Convert ints to floats while scaling: dst[k] = src[k] * mul for k < len.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int remaining;

    for (remaining = len; remaining > 0; remaining--, dst++, src++)
        *dst = *src * mul;
}
4234
/* Load the bit pattern of a float. memcpy is used so that the float is never
 * accessed through a uint32_t lvalue (which would break the aliasing rules). */
static inline uint32_t clipf_load_bits(const float *p)
{
    uint32_t bits;
    memcpy(&bits, p, sizeof(bits));
    return bits;
}

/* Store a bit pattern into a float, again without pointer-cast punning. */
static inline void clipf_store_bits(float *p, uint32_t bits)
{
    memcpy(p, &bits, sizeof(bits));
}

/**
 * Clip one float, given as its raw bit pattern, to a range whose lower bound
 * is negative and whose upper bound is positive.
 * Assumes 32-bit sign-magnitude (IEEE-754 style) floats.
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with its sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* Negative floats have the sign bit set, so as unsigned integers they
     * sort above every positive value and grow with magnitude: a > mini
     * means "more negative than the lower bound". */
    if(a > mini) return mini;
    /* Flipping the sign bit moves positive values into the upper half, where
     * the same unsigned comparison detects "larger than the upper bound".
     * 1U is required: 1<<31 would shift into the sign bit of a signed int. */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip len floats from src into dst for the case *min < 0 < *max, using
 * integer comparisons on the float bit patterns.
 * Elements are processed in groups of 8, so len is effectively rounded up to
 * a multiple of 8; both buffers must be large enough for that.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    const uint32_t mini     = clipf_load_bits(min);
    const uint32_t maxi     = clipf_load_bits(max);
    const uint32_t maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        for(j=0; j<8; j++)
            clipf_store_bits(dst + i + j,
                             clipf_c_one(clipf_load_bits(src + i + j),
                                         mini, maxi, maxisign));
    }
}
/**
 * Clip len floats from src into dst to the range [min, max].
 * When min is negative and max is positive the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are processed in groups of 8, so len is effectively
 * rounded up to a multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
4279
4280 static av_always_inline int float_to_int16_one(const float *src){
4281     int_fast32_t tmp = *(const int32_t*)src;
4282     if(tmp & 0xf0000){
4283         tmp = (0x43c0ffff - tmp)>>31;
4284         // is this faster on some gcc/cpu combinations?
4285 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4286 //      else                 tmp = 0;
4287     }
4288     return tmp - 0x8000;
4289 }
4290
/**
 * Convert len floats to int16_t samples with float_to_int16_one().
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int n;

    for (n = 0; n < len; n++, src++)
        dst[n] = float_to_int16_one(src);
}
4296
/**
 * Convert planar float channels to interleaved int16_t samples.
 * @param dst      output, len*channels samples; sample i of channel c is
 *                 written to dst[i*channels + c]
 * @param src      one pointer per channel, each to len floats
 * @param len      samples per channel
 * @param channels number of channels (2 has a dedicated loop)
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, c;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2*i  ] = float_to_int16_one(src[0] + i);
            dst[2*i+1] = float_to_int16_one(src[1] + i);
        }
        return;
    }

    for (c = 0; c < channels; c++)
        for (i = 0; i < len; i++)
            dst[i*channels + c] = float_to_int16_one(src[c] + i);
}
4310
/**
 * Dot product of two int16_t vectors where every product is shifted right
 * by shift before being accumulated.
 * @param order number of elements
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--, v1++, v2++)
        acc += (*v1 * *v2) >> shift;

    return acc;
}
4320
/**
 * Dot product of v1 and v2 combined with an in-place update of v1.
 * For each of the order elements, the product v1[k]*v2[k] is accumulated
 * using the value of v1[k] before the update, then v1[k] += mul * v3[k].
 * @return the accumulated dot product
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order; order--, v1++, v2++, v3++) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
    }
    return acc;
}
4330
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * In-place 8-point inverse transform of one row, b[0..7].
 * Results are rounded and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: weighted combinations of the input pairs */
    const int o1 = W1*b[1] + W7*b[7];
    const int o7 = W7*b[1] - W1*b[7];
    const int o5 = W5*b[5] + W3*b[3];
    const int o3 = W3*b[5] - W5*b[3];
    const int e2 = W2*b[2] + W6*b[6];
    const int e6 = W6*b[2] - W2*b[6];
    const int e0 = W0*b[0] + W0*b[4];
    const int e4 = W0*b[0] - W0*b[4];
    /* step 2: 181/256 is approximately 1/sqrt(2) */
    const int m1 = (181*(o1-o5+o7-o3)+128)>>8; //1,3,5,7,
    const int m2 = (181*(o1-o5-o7+o3)+128)>>8;
    /* step 3: each even/odd pair yields two mirrored outputs */
    const int rnd   = 1<<7;
    const int e02p  = e0 + e2, e02m = e0 - e2;
    const int e46p  = e4 + e6, e46m = e4 - e6;
    const int o15   = o1 + o5;
    const int o73   = o7 + o3;

    b[0] = (e02p + o15 + rnd)>>8;
    b[7] = (e02p - o15 + rnd)>>8;
    b[1] = (e46p + m1  + rnd)>>8;
    b[6] = (e46p - m1  + rnd)>>8;
    b[2] = (e46m + m2  + rnd)>>8;
    b[5] = (e46m - m2  + rnd)>>8;
    b[3] = (e02m + o73 + rnd)>>8;
    b[4] = (e02m - o73 + rnd)>>8;
}
4366 static void wmv2_idct_col(short * b)
4367 {
4368     int s1,s2;
4369     int a0,a1,a2,a3,a4,a5,a6,a7;
4370     /*step 1, with extended precision*/
4371     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4372     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4373     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4374     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4375     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4376     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4377     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4378     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4379     /*step 2*/
4380     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4381     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4382     /*step 3*/
4383     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4384     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4385     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4386     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4387
4388     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4389     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4390     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4391     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4392 }
/**
 * In-place 8x8 WMV2 inverse DCT: wmv2_idct_row() on each of the eight rows
 * of block, then wmv2_idct_col() on each of the eight columns.
 */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);

    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
4403 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4404  converted */
/* Runs ff_wmv2_idct_c() on block in place, then passes the result together
 * with dest and line_size to put_pixels_clamped_c() (defined outside this
 * excerpt; presumably it stores the clamped values into dest). */
4405 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4406 {
4407     ff_wmv2_idct_c(block);
4408     put_pixels_clamped_c(block, dest, line_size);
4409 }
/* Runs ff_wmv2_idct_c() on block in place, then passes the result together
 * with dest and line_size to add_pixels_clamped_c() (defined outside this
 * excerpt; presumably it adds the values onto dest with clamping). */
4410 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4411 {
4412     ff_wmv2_idct_c(block);
4413     add_pixels_clamped_c(block, dest, line_size);
4414 }
/* Runs j_rev_dct() on block, then passes it with dest and line_size to
 * put_pixels_clamped_c(). Both helpers are defined outside this excerpt. */
4415 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4416 {
4417     j_rev_dct (block);
4418     put_pixels_clamped_c(block, dest, line_size);
4419 }
/* Runs j_rev_dct() on block, then passes it with dest and line_size to
 * add_pixels_clamped_c(). Both helpers are defined outside this excerpt. */
4420 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4421 {
4422     j_rev_dct (block);
4423     add_pixels_clamped_c(block, dest, line_size);
4424 }
4425
/* Runs j_rev_dct4() on block, then passes it with dest and line_size to
 * put_pixels_clamped4_c(). Both helpers are defined outside this excerpt;
 * NOTE(review): presumably the 4x4 variant used for reduced resolution --
 * confirm. */
4426 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4427 {
4428     j_rev_dct4 (block);
4429     put_pixels_clamped4_c(block, dest, line_size);
4430 }
/* Runs j_rev_dct4() on block, then passes it with dest and line_size to
 * add_pixels_clamped4_c(). Both helpers are defined outside this excerpt. */
4431 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4432 {
4433     j_rev_dct4 (block);
4434     add_pixels_clamped4_c(block, dest, line_size);
4435 }
4436
/* Runs j_rev_dct2() on block, then passes it with dest and line_size to
 * put_pixels_clamped2_c(). Both helpers are defined outside this excerpt;
 * NOTE(review): presumably the 2x2 variant used for reduced resolution --
 * confirm. */
4437 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4438 {
4439     j_rev_dct2 (block);
4440     put_pixels_clamped2_c(block, dest, line_size);
4441 }
/* Runs j_rev_dct2() on block, then passes it with dest and line_size to
 * add_pixels_clamped2_c(). Both helpers are defined outside this excerpt. */
4442 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4443 {
4444     j_rev_dct2 (block);
4445     add_pixels_clamped2_c(block, dest, line_size);
4446 }
4447
4448 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4449 {
4450     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4451
4452     dest[0] = cm[(block[0] + 4)>>3];
4453 }
4454 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4455 {
4456     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4457
4458     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4459 }
4460
/* Does nothing; all three parameters are marked av_unused.
 * NOTE(review): presumably installed as a no-op function pointer elsewhere
 * in this file -- the use is outside this excerpt. */
4461 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4462
4463 /* init static data */
4464 av_cold void dsputil_static_init(void)
4465 {
4466     int i;
4467
4468     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4469     for(i=0;i<MAX_NEG_CROP;i++) {
4470         ff_cropTbl[i] = 0;
4471         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4472     }
4473
4474     for(i=0;i<512;i++) {
4475         ff_squareTbl[i] = (i - 256) * (i - 256);
4476     }
4477
4478     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4479 }
4480
4481 int ff_check_alignment(void){
4482     static int did_fail=0;
4483     DECLARE_ALIGNED_16(int, aligned);
4484
4485     if((intptr_t)&aligned & 15){
4486         if(!did_fail){
4487 #if HAVE_MMX || HAVE_ALTIVEC
4488             av_log(NULL, AV_LOG_ERROR,
4489                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4490                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4491                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4492                 "Do not report crashes to FFmpeg developers.\n");
4493 #endif
4494             did_fail=1;
4495         }
4496         return -1;
4497     }
4498     return 0;
4499 }
4500
4501 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4502 {
4503     int i;
4504
4505     ff_check_alignment();
4506
4507 #if CONFIG_ENCODERS
4508     if(avctx->dct_algo==FF_DCT_FASTINT) {
4509         c->fdct = fdct_ifast;
4510         c->fdct248 = fdct_ifast248;
4511     }
4512     else if(avctx->dct_algo==FF_DCT_FAAN) {
4513         c->fdct = ff_faandct;
4514         c->fdct248 = ff_faandct248;
4515     }
4516     else {
4517         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4518         c->fdct248 = ff_fdct248_islow;
4519     }
4520 #endif //CONFIG_ENCODERS
4521
4522     if(avctx->lowres==1){
4523         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4524             c->idct_put= ff_jref_idct4_put;
4525             c->idct_add= ff_jref_idct4_add;
4526         }else{
4527             c->idct_put= ff_h264_lowres_idct_put_c;
4528             c->idct_add= ff_h264_lowres_idct_add_c;
4529         }
4530         c->idct    = j_rev_dct4;
4531         c->idct_permutation_type= FF_NO_IDCT_PERM;
4532     }else if(avctx->lowres==2){
4533         c->idct_put= ff_jref_idct2_put;
4534         c->idct_add= ff_jref_idct2_add;
4535         c->idct    = j_rev_dct2;
4536         c->idct_permutation_type= FF_NO_IDCT_PERM;
4537     }else if(avctx->lowres==3){
4538         c->idct_put= ff_jref_idct1_put;
4539         c->idct_add= ff_jref_idct1_add;
4540         c->idct    = j_rev_dct1;
4541         c->idct_permutation_type= FF_NO_IDCT_PERM;
4542     }else{
4543         if(avctx->idct_algo==FF_IDCT_INT){
4544             c->idct_put= ff_jref_idct_put;
4545             c->idct_add= ff_jref_idct_add;
4546             c->idct    = j_rev_dct;
4547             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4548         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4549                 avctx->idct_algo==FF_IDCT_VP3){
4550             c->idct_put= ff_vp3_idct_put_c;
4551             c->idct_add= ff_vp3_idct_add_c;
4552             c->idct    = ff_vp3_idct_c;
4553             c->idct_permutation_type= FF_NO_IDCT_PERM;
4554         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4555             c->idct_put= ff_wmv2_idct_put_c;
4556             c->idct_add= ff_wmv2_idct_add_c;
4557             c->idct    = ff_wmv2_idct_c;
4558             c->idct_permutation_type= FF_NO_IDCT_PERM;
4559         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4560             c->idct_put= ff_faanidct_put;
4561             c->idct_add= ff_faanidct_add;
4562             c->idct    = ff_faanidct;
4563             c->idct_permutation_type= FF_NO_IDCT_PERM;
4564         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4565             c->idct_put= ff_ea_idct_put_c;
4566             c->idct_permutation_type= FF_NO_IDCT_PERM;
4567         }else{ //accurate/default
4568             c->idct_put= ff_simple_idct_put;
4569             c->idct_add= ff_simple_idct_add;
4570             c->idct    = ff_simple_idct;
4571             c->idct_permutation_type= FF_NO_IDCT_PERM;
4572         }
4573     }
4574
4575     if (CONFIG_H264_DECODER) {
4576         c->h264_idct_add= ff_h264_idct_add_c;
4577         c->h264_idct8_add= ff_h264_idct8_add_c;
4578         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4579         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4580         c->h264_idct_add16     = ff_h264_idct_add16_c;
4581         c->h264_idct8_add4     = ff_h264_idct8_add4_c;
4582         c->h264_idct_add8      = ff_h264_idct_add8_c;
4583         c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4584     }
4585
4586     c->get_pixels = get_pixels_c;
4587     c->diff_pixels = diff_pixels_c;
4588     c->put_pixels_clamped = put_pixels_clamped_c;
4589     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4590     c->add_pixels_clamped = add_pixels_clamped_c;
4591     c->add_pixels8 = add_pixels8_c;
4592     c->add_pixels4 = add_pixels4_c;
4593     c->sum_abs_dctelem = sum_abs_dctelem_c;
4594     c->gmc1 = gmc1_c;
4595     c->gmc = ff_gmc_c;
4596     c->clear_block = clear_block_c;
4597     c->clear_blocks = clear_blocks_c;
4598     c->pix_sum = pix_sum_c;
4599     c->pix_norm1 = pix_norm1_c;
4600
4601     /* TODO [0] 16  [1] 8 */
4602     c->pix_abs[0][0] = pix_abs16_c;
4603     c->pix_abs[0][1] = pix_abs16_x2_c;
4604     c->pix_abs[0][2] = pix_abs16_y2_c;
4605     c->pix_abs[0][3] = pix_abs16_xy2_c;
4606     c->pix_abs[1][0] = pix_abs8_c;
4607     c->pix_abs[1][1] = pix_abs8_x2_c;
4608     c->pix_abs[1][2] = pix_abs8_y2_c;
4609     c->pix_abs[1][3] = pix_abs8_xy2_c;
4610
4611 #define dspfunc(PFX, IDX, NUM) \
4612     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4613     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4614     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4615     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4616
4617     dspfunc(put, 0, 16);
4618     dspfunc(put_no_rnd, 0, 16);
4619     dspfunc(put, 1, 8);
4620     dspfunc(put_no_rnd, 1, 8);
4621     dspfunc(put, 2, 4);
4622     dspfunc(put, 3, 2);
4623
4624     dspfunc(avg, 0, 16);
4625     dspfunc(avg_no_rnd, 0, 16);
4626     dspfunc(avg, 1, 8);
4627     dspfunc(avg_no_rnd, 1, 8);
4628     dspfunc(avg, 2, 4);
4629     dspfunc(avg, 3, 2);
4630 #undef dspfunc
4631
4632     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4633     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4634
4635     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4636     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4637     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4638     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4639     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4640     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4641     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4642     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4643     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4644
4645     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4646     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4647     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4648     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4649     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4650     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4651     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4652     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4653     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4654
4655 #define dspfunc(PFX, IDX, NUM) \
4656     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4657     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4658     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4659     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4660     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4661     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4662     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4663     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4664     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4665     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4666     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4667     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4668     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4669     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4670     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4671     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4672
4673     dspfunc(put_qpel, 0, 16);
4674     dspfunc(put_no_rnd_qpel, 0, 16);
4675
4676     dspfunc(avg_qpel, 0, 16);
4677     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4678
4679     dspfunc(put_qpel, 1, 8);
4680     dspfunc(put_no_rnd_qpel, 1, 8);
4681
4682     dspfunc(avg_qpel, 1, 8);
4683     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4684
4685     dspfunc(put_h264_qpel, 0, 16);
4686     dspfunc(put_h264_qpel, 1, 8);
4687     dspfunc(put_h264_qpel, 2, 4);
4688     dspfunc(put_h264_qpel, 3, 2);
4689     dspfunc(avg_h264_qpel, 0, 16);
4690     dspfunc(avg_h264_qpel, 1, 8);
4691     dspfunc(avg_h264_qpel, 2, 4);
4692
4693 #undef dspfunc
4694     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4695     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4696     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4697     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4698     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4699     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4700     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4701     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4702
4703     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4704     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4705     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4706     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4707     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4708     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4709     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4710     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4711     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4712     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4713     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4714     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4715     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4716     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4717     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4718     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4719     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4720     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4721     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4722     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4723
4724     c->draw_edges = draw_edges_c;
4725
4726 #if CONFIG_CAVS_DECODER
4727     ff_cavsdsp_init(c,avctx);
4728 #endif
4729
4730 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4731     ff_mlp_init(c, avctx);
4732 #endif
4733 #if CONFIG_VC1_DECODER
4734     ff_vc1dsp_init(c,avctx);
4735 #endif
4736 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4737     ff_intrax8dsp_init(c,avctx);
4738 #endif
4739 #if CONFIG_RV30_DECODER
4740     ff_rv30dsp_init(c,avctx);
4741 #endif
4742 #if CONFIG_RV40_DECODER
4743     ff_rv40dsp_init(c,avctx);
4744     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4745     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4746     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4747     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4748 #endif
4749
4750     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4751     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4752     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4753     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4754     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4755     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4756     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4757     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4758
4759 #define SET_CMP_FUNC(name) \
4760     c->name[0]= name ## 16_c;\
4761     c->name[1]= name ## 8x8_c;
4762
4763     SET_CMP_FUNC(hadamard8_diff)
4764     c->hadamard8_diff[4]= hadamard8_intra16_c;
4765     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4766     SET_CMP_FUNC(dct_sad)
4767     SET_CMP_FUNC(dct_max)
4768 #if CONFIG_GPL
4769     SET_CMP_FUNC(dct264_sad)
4770 #endif
4771     c->sad[0]= pix_abs16_c;
4772     c->sad[1]= pix_abs8_c;
4773     c->sse[0]= sse16_c;
4774     c->sse[1]= sse8_c;
4775     c->sse[2]= sse4_c;
4776     SET_CMP_FUNC(quant_psnr)
4777     SET_CMP_FUNC(rd)
4778     SET_CMP_FUNC(bit)
4779     c->vsad[0]= vsad16_c;
4780     c->vsad[4]= vsad_intra16_c;
4781     c->vsad[5]= vsad_intra8_c;
4782     c->vsse[0]= vsse16_c;
4783     c->vsse[4]= vsse_intra16_c;
4784     c->vsse[5]= vsse_intra8_c;
4785     c->nsse[0]= nsse16_c;
4786     c->nsse[1]= nsse8_c;
4787 #if CONFIG_SNOW_ENCODER
4788     c->w53[0]= w53_16_c;
4789     c->w53[1]= w53_8_c;
4790     c->w97[0]= w97_16_c;
4791     c->w97[1]= w97_8_c;
4792 #endif
4793
4794     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4795
4796     c->add_bytes= add_bytes_c;
4797     c->add_bytes_l2= add_bytes_l2_c;
4798     c->diff_bytes= diff_bytes_c;
4799     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4800     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4801     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4802     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4803     c->bswap_buf= bswap_buf;
4804 #if CONFIG_PNG_DECODER
4805     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4806 #endif
4807
4808     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4809     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4810     c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4811     c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4812     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4813     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4814     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4815     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4816     c->h264_loop_filter_strength= NULL;
4817
4818     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4819         c->h263_h_loop_filter= h263_h_loop_filter_c;
4820         c->h263_v_loop_filter= h263_v_loop_filter_c;
4821     }
4822
4823     if (CONFIG_VP3_DECODER) {
4824         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4825         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4826     }
4827     if (CONFIG_VP6_DECODER) {
4828         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4829     }
4830
4831     c->h261_loop_filter= h261_loop_filter_c;
4832
4833     c->try_8x8basis= try_8x8basis_c;
4834     c->add_8x8basis= add_8x8basis_c;
4835
4836 #if CONFIG_SNOW_DECODER
4837     c->vertical_compose97i = ff_snow_vertical_compose97i;
4838     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4839     c->inner_add_yblock = ff_snow_inner_add_yblock;
4840 #endif
4841
4842 #if CONFIG_VORBIS_DECODER
4843     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4844 #endif
4845 #if CONFIG_AC3_DECODER
4846     c->ac3_downmix = ff_ac3_downmix_c;
4847 #endif
4848 #if CONFIG_LPC
4849     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4850 #endif
4851     c->vector_fmul = vector_fmul_c;
4852     c->vector_fmul_reverse = vector_fmul_reverse_c;
4853     c->vector_fmul_add = vector_fmul_add_c;
4854     c->vector_fmul_window = ff_vector_fmul_window_c;
4855     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4856     c->vector_clipf = vector_clipf_c;
4857     c->float_to_int16 = ff_float_to_int16_c;
4858     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4859     c->scalarproduct_int16 = scalarproduct_int16_c;
4860     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4861     c->scalarproduct_float = scalarproduct_float_c;
4862     c->butterflies_float = butterflies_float_c;
4863     c->vector_fmul_scalar = vector_fmul_scalar_c;
4864
4865     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4866     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4867
4868     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4869     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4870
4871     c->shrink[0]= ff_img_copy_plane;
4872     c->shrink[1]= ff_shrink22;
4873     c->shrink[2]= ff_shrink44;
4874     c->shrink[3]= ff_shrink88;
4875
4876     c->prefetch= just_return;
4877
4878     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4879     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4880
4881     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4882     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4883     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4884     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4885     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4886     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4887     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4888     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4889     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4890
4891     for(i=0; i<64; i++){
4892         if(!c->put_2tap_qpel_pixels_tab[0][i])
4893             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4894         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4895             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4896     }
4897
4898     switch(c->idct_permutation_type){
4899     case FF_NO_IDCT_PERM:
4900         for(i=0; i<64; i++)
4901             c->idct_permutation[i]= i;
4902         break;
4903     case FF_LIBMPEG2_IDCT_PERM:
4904         for(i=0; i<64; i++)
4905             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4906         break;
4907     case FF_SIMPLE_IDCT_PERM:
4908         for(i=0; i<64; i++)
4909             c->idct_permutation[i]= simple_mmx_permutation[i];
4910         break;
4911     case FF_TRANSPOSE_IDCT_PERM:
4912         for(i=0; i<64; i++)
4913             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4914         break;
4915     case FF_PARTTRANS_IDCT_PERM:
4916         for(i=0; i<64; i++)
4917             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4918         break;
4919     case FF_SSE2_IDCT_PERM:
4920         for(i=0; i<64; i++)
4921             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4922         break;
4923     default:
4924         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4925     }
4926 }
4927