git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33 #include "snow.h"
  34
  35 /* snow.c */
  36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  37
  38 /* vorbis.c */
  39 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  40
/*
 * Global lookup tables, zero-initialised at this point.
 *
 * cropTbl   is read through "cropTbl + MAX_NEG_CROP" by the *_pixels_clamped*
 *           routines in this file, so indices from -MAX_NEG_CROP up to
 *           255 + MAX_NEG_CROP stay inside the array.
 * squareTbl is read through "squareTbl + 256" by pix_norm1_c and the sse*_c
 *           routines, so indices from -256 to 255 stay inside the array.
 *
 * NOTE(review): nothing visible here fills either table; presumably an init
 * routine elsewhere writes the clamp values and the squares -- confirm.
 */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t squareTbl[512] = {0, };
  43
/* Zigzag scan table for an 8x8 block: 64 coefficient indices in the range
 * 0..63, listed eight per line.
 * NOTE(review): presumably entry i is the raster index (row*8 + column) of the
 * i-th coefficient in scan order -- confirm against the users of this table. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  54
/* Zigzag scan table specific to the 248 IDCT: 64 coefficient indices in the
 * range 0..63. NOTE: unlike the specification, the two fields are
 * interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  67
/* Inverse of zigzag_direct, plus 1, without any IDCT permutation applied;
 * intended for the MMX quantizer. 8-byte aligned, 16-bit entries.
 * Zero-initialised here; NOTE(review): presumably filled in by an init
 * routine that is not visible in this part of the file -- confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  70
/* Alternate (horizontal) scan table for an 8x8 block: 64 coefficient indices
 * in the range 0..63, listed eight per line.
 * NOTE(review): presumably indexed by scan position like ff_zigzag_direct --
 * confirm against the users of this table. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  81
/* Alternate (vertical) scan table for an 8x8 block: 64 coefficient indices
 * in the range 0..63, listed eight per line.
 * NOTE(review): presumably indexed by scan position like ff_zigzag_direct --
 * confirm against the users of this table. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  92
/* Reciprocal table for division by multiplication:
 *     a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * The product has to be formed in 64 bits for the >>32 to be meaningful.
 * The entries for b >= 2 match 2^32/b rounded up (inverse[2] is 2^31,
 * inverse[3] is 1431655766). inverse[0] (0) and inverse[1] (2^32-1) lie
 * outside the range covered by the guarantee above. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 128
/* Input permutation for the simple_idct_mmx: 64 coefficient indices in the
 * range 0x00..0x3F, listed eight per line. File-local. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 140
 141 static int pix_sum_c(uint8_t * pix, int line_size)
 142 {
 143     int s, i, j;
 144
 145     s = 0;
 146     for (i = 0; i < 16; i++) {
 147         for (j = 0; j < 16; j += 8) {
 148             s += pix[0];
 149             s += pix[1];
 150             s += pix[2];
 151             s += pix[3];
 152             s += pix[4];
 153             s += pix[5];
 154             s += pix[6];
 155             s += pix[7];
 156             pix += 8;
 157         }
 158         pix += line_size - 16;
 159     }
 160     return s;
 161 }
 162
 163 static int pix_norm1_c(uint8_t * pix, int line_size)
 164 {
 165     int s, i, j;
 166     uint32_t *sq = squareTbl + 256;
 167
 168     s = 0;
 169     for (i = 0; i < 16; i++) {
 170         for (j = 0; j < 16; j += 8) {
 171 #if 0
 172             s += sq[pix[0]];
 173             s += sq[pix[1]];
 174             s += sq[pix[2]];
 175             s += sq[pix[3]];
 176             s += sq[pix[4]];
 177             s += sq[pix[5]];
 178             s += sq[pix[6]];
 179             s += sq[pix[7]];
 180 #else
 181 #if LONG_MAX > 2147483647
 182             register uint64_t x=*(uint64_t*)pix;
 183             s += sq[x&0xff];
 184             s += sq[(x>>8)&0xff];
 185             s += sq[(x>>16)&0xff];
 186             s += sq[(x>>24)&0xff];
 187             s += sq[(x>>32)&0xff];
 188             s += sq[(x>>40)&0xff];
 189             s += sq[(x>>48)&0xff];
 190             s += sq[(x>>56)&0xff];
 191 #else
 192             register uint32_t x=*(uint32_t*)pix;
 193             s += sq[x&0xff];
 194             s += sq[(x>>8)&0xff];
 195             s += sq[(x>>16)&0xff];
 196             s += sq[(x>>24)&0xff];
 197             x=*(uint32_t*)(pix+4);
 198             s += sq[x&0xff];
 199             s += sq[(x>>8)&0xff];
 200             s += sq[(x>>16)&0xff];
 201             s += sq[(x>>24)&0xff];
 202 #endif
 203 #endif
 204             pix += 8;
 205         }
 206         pix += line_size - 16;
 207     }
 208     return s;
 209 }
 210
 211 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 212     int i;
 213
 214     for(i=0; i+8<=w; i+=8){
 215         dst[i+0]= bswap_32(src[i+0]);
 216         dst[i+1]= bswap_32(src[i+1]);
 217         dst[i+2]= bswap_32(src[i+2]);
 218         dst[i+3]= bswap_32(src[i+3]);
 219         dst[i+4]= bswap_32(src[i+4]);
 220         dst[i+5]= bswap_32(src[i+5]);
 221         dst[i+6]= bswap_32(src[i+6]);
 222         dst[i+7]= bswap_32(src[i+7]);
 223     }
 224     for(;i<w; i++){
 225         dst[i+0]= bswap_32(src[i+0]);
 226     }
 227 }
 228
 229 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 230 {
 231     int s, i;
 232     uint32_t *sq = squareTbl + 256;
 233
 234     s = 0;
 235     for (i = 0; i < h; i++) {
 236         s += sq[pix1[0] - pix2[0]];
 237         s += sq[pix1[1] - pix2[1]];
 238         s += sq[pix1[2] - pix2[2]];
 239         s += sq[pix1[3] - pix2[3]];
 240         pix1 += line_size;
 241         pix2 += line_size;
 242     }
 243     return s;
 244 }
 245
 246 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 247 {
 248     int s, i;
 249     uint32_t *sq = squareTbl + 256;
 250
 251     s = 0;
 252     for (i = 0; i < h; i++) {
 253         s += sq[pix1[0] - pix2[0]];
 254         s += sq[pix1[1] - pix2[1]];
 255         s += sq[pix1[2] - pix2[2]];
 256         s += sq[pix1[3] - pix2[3]];
 257         s += sq[pix1[4] - pix2[4]];
 258         s += sq[pix1[5] - pix2[5]];
 259         s += sq[pix1[6] - pix2[6]];
 260         s += sq[pix1[7] - pix2[7]];
 261         pix1 += line_size;
 262         pix2 += line_size;
 263     }
 264     return s;
 265 }
 266
 267 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 268 {
 269     int s, i;
 270     uint32_t *sq = squareTbl + 256;
 271
 272     s = 0;
 273     for (i = 0; i < h; i++) {
 274         s += sq[pix1[ 0] - pix2[ 0]];
 275         s += sq[pix1[ 1] - pix2[ 1]];
 276         s += sq[pix1[ 2] - pix2[ 2]];
 277         s += sq[pix1[ 3] - pix2[ 3]];
 278         s += sq[pix1[ 4] - pix2[ 4]];
 279         s += sq[pix1[ 5] - pix2[ 5]];
 280         s += sq[pix1[ 6] - pix2[ 6]];
 281         s += sq[pix1[ 7] - pix2[ 7]];
 282         s += sq[pix1[ 8] - pix2[ 8]];
 283         s += sq[pix1[ 9] - pix2[ 9]];
 284         s += sq[pix1[10] - pix2[10]];
 285         s += sq[pix1[11] - pix2[11]];
 286         s += sq[pix1[12] - pix2[12]];
 287         s += sq[pix1[13] - pix2[13]];
 288         s += sq[pix1[14] - pix2[14]];
 289         s += sq[pix1[15] - pix2[15]];
 290
 291         pix1 += line_size;
 292         pix2 += line_size;
 293     }
 294     return s;
 295 }
 296
 297
 298 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 299 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 300     int s, i, j;
 301     const int dec_count= w==8 ? 3 : 4;
 302     int tmp[32*32];
 303     int level, ori;
 304     static const int scale[2][2][4][4]={
 305       {
 306         {
 307             // 9/7 8x8 dec=3
 308             {268, 239, 239, 213},
 309             {  0, 224, 224, 152},
 310             {  0, 135, 135, 110},
 311         },{
 312             // 9/7 16x16 or 32x32 dec=4
 313             {344, 310, 310, 280},
 314             {  0, 320, 320, 228},
 315             {  0, 175, 175, 136},
 316             {  0, 129, 129, 102},
 317         }
 318       },{
 319         {
 320             // 5/3 8x8 dec=3
 321             {275, 245, 245, 218},
 322             {  0, 230, 230, 156},
 323             {  0, 138, 138, 113},
 324         },{
 325             // 5/3 16x16 or 32x32 dec=4
 326             {352, 317, 317, 286},
 327             {  0, 328, 328, 233},
 328             {  0, 180, 180, 140},
 329             {  0, 132, 132, 105},
 330         }
 331       }
 332     };
 333
 334     for (i = 0; i < h; i++) {
 335         for (j = 0; j < w; j+=4) {
 336             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 337             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 338             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 339             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 340         }
 341         pix1 += line_size;
 342         pix2 += line_size;
 343     }
 344
 345     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 346
 347     s=0;
 348     assert(w==h);
 349     for(level=0; level<dec_count; level++){
 350         for(ori= level ? 1 : 0; ori<4; ori++){
 351             int size= w>>(dec_count-level);
 352             int sx= (ori&1) ? size : 0;
 353             int stride= 32<<(dec_count-level);
 354             int sy= (ori&2) ? stride>>1 : 0;
 355
 356             for(i=0; i<size; i++){
 357                 for(j=0; j<size; j++){
 358                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 359                     s += ABS(v);
 360                 }
 361             }
 362         }
 363     }
 364     assert(s>=0);
 365     return s>>9;
 366 }
 367
 368 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 369     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 370 }
 371
 372 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 373     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 374 }
 375
 376 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 377     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 378 }
 379
 380 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 381     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 382 }
 383
 384 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 385     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 386 }
 387
 388 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 389     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 390 }
 391 #endif
 392
 393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 394 {
 395     int i;
 396
 397     /* read the pixels */
 398     for(i=0;i<8;i++) {
 399         block[0] = pixels[0];
 400         block[1] = pixels[1];
 401         block[2] = pixels[2];
 402         block[3] = pixels[3];
 403         block[4] = pixels[4];
 404         block[5] = pixels[5];
 405         block[6] = pixels[6];
 406         block[7] = pixels[7];
 407         pixels += line_size;
 408         block += 8;
 409     }
 410 }
 411
 412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 413                           const uint8_t *s2, int stride){
 414     int i;
 415
 416     /* read the pixels */
 417     for(i=0;i<8;i++) {
 418         block[0] = s1[0] - s2[0];
 419         block[1] = s1[1] - s2[1];
 420         block[2] = s1[2] - s2[2];
 421         block[3] = s1[3] - s2[3];
 422         block[4] = s1[4] - s2[4];
 423         block[5] = s1[5] - s2[5];
 424         block[6] = s1[6] - s2[6];
 425         block[7] = s1[7] - s2[7];
 426         s1 += stride;
 427         s2 += stride;
 428         block += 8;
 429     }
 430 }
 431
 432
 433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 434                                  int line_size)
 435 {
 436     int i;
 437     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 438
 439     /* read the pixels */
 440     for(i=0;i<8;i++) {
 441         pixels[0] = cm[block[0]];
 442         pixels[1] = cm[block[1]];
 443         pixels[2] = cm[block[2]];
 444         pixels[3] = cm[block[3]];
 445         pixels[4] = cm[block[4]];
 446         pixels[5] = cm[block[5]];
 447         pixels[6] = cm[block[6]];
 448         pixels[7] = cm[block[7]];
 449
 450         pixels += line_size;
 451         block += 8;
 452     }
 453 }
 454
 455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 456                                  int line_size)
 457 {
 458     int i;
 459     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 460
 461     /* read the pixels */
 462     for(i=0;i<4;i++) {
 463         pixels[0] = cm[block[0]];
 464         pixels[1] = cm[block[1]];
 465         pixels[2] = cm[block[2]];
 466         pixels[3] = cm[block[3]];
 467
 468         pixels += line_size;
 469         block += 8;
 470     }
 471 }
 472
 473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 474                                  int line_size)
 475 {
 476     int i;
 477     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 478
 479     /* read the pixels */
 480     for(i=0;i<2;i++) {
 481         pixels[0] = cm[block[0]];
 482         pixels[1] = cm[block[1]];
 483
 484         pixels += line_size;
 485         block += 8;
 486     }
 487 }
 488
 489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 490                                         uint8_t *restrict pixels,
 491                                         int line_size)
 492 {
 493     int i, j;
 494
 495     for (i = 0; i < 8; i++) {
 496         for (j = 0; j < 8; j++) {
 497             if (*block < -128)
 498                 *pixels = 0;
 499             else if (*block > 127)
 500                 *pixels = 255;
 501             else
 502                 *pixels = (uint8_t)(*block + 128);
 503             block++;
 504             pixels++;
 505         }
 506         pixels += (line_size - 8);
 507     }
 508 }
 509
 510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 511                           int line_size)
 512 {
 513     int i;
 514     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 515
 516     /* read the pixels */
 517     for(i=0;i<8;i++) {
 518         pixels[0] = cm[pixels[0] + block[0]];
 519         pixels[1] = cm[pixels[1] + block[1]];
 520         pixels[2] = cm[pixels[2] + block[2]];
 521         pixels[3] = cm[pixels[3] + block[3]];
 522         pixels[4] = cm[pixels[4] + block[4]];
 523         pixels[5] = cm[pixels[5] + block[5]];
 524         pixels[6] = cm[pixels[6] + block[6]];
 525         pixels[7] = cm[pixels[7] + block[7]];
 526         pixels += line_size;
 527         block += 8;
 528     }
 529 }
 530
 531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 532                           int line_size)
 533 {
 534     int i;
 535     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 536
 537     /* read the pixels */
 538     for(i=0;i<4;i++) {
 539         pixels[0] = cm[pixels[0] + block[0]];
 540         pixels[1] = cm[pixels[1] + block[1]];
 541         pixels[2] = cm[pixels[2] + block[2]];
 542         pixels[3] = cm[pixels[3] + block[3]];
 543         pixels += line_size;
 544         block += 8;
 545     }
 546 }
 547
 548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 549                           int line_size)
 550 {
 551     int i;
 552     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 553
 554     /* read the pixels */
 555     for(i=0;i<2;i++) {
 556         pixels[0] = cm[pixels[0] + block[0]];
 557         pixels[1] = cm[pixels[1] + block[1]];
 558         pixels += line_size;
 559         block += 8;
 560     }
 561 }
 562
 563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 564 {
 565     int i;
 566     for(i=0;i<8;i++) {
 567         pixels[0] += block[0];
 568         pixels[1] += block[1];
 569         pixels[2] += block[2];
 570         pixels[3] += block[3];
 571         pixels[4] += block[4];
 572         pixels[5] += block[5];
 573         pixels[6] += block[6];
 574         pixels[7] += block[7];
 575         pixels += line_size;
 576         block += 8;
 577     }
 578 }
 579
 580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 581 {
 582     int i;
 583     for(i=0;i<4;i++) {
 584         pixels[0] += block[0];
 585         pixels[1] += block[1];
 586         pixels[2] += block[2];
 587         pixels[3] += block[3];
 588         pixels += line_size;
 589         block += 4;
 590     }
 591 }
 592
 593 #if 0
 594
/*
 * Disabled variant of PIXOP2 (it sits in the "#if 0" branch): every row of
 * 8 pixels is handled as one 64-bit word, loaded with LD64 and written
 * through a uint64_t pointer.
 *
 * PIXOP2(OPNAME, OP) expands to a family of static functions whose names
 * start with OPNAME. Each walks h rows, advancing block and pixels by
 * line_size, and hands the computed word to OP(destination, value):
 *   _pixels              the loaded word unchanged
 *   _pixels_x2 / _y2     (a|b) - (((a^b)&0xFE..FE)>>1), i.e. the per-byte
 *                        average rounded up, of the word and its neighbour
 *                        one byte to the right / one row below
 *   _no_rnd_pixels_x2 / _y2
 *                        (a&b) + (((a^b)&0xFE..FE)>>1), i.e. the per-byte
 *                        average rounded down, of the same pairs
 *   _pixels_xy2 / _no_rnd_pixels_xy2
 *                        combination of the four neighbouring samples with a
 *                        per-byte bias of 2 / 1 before the final >>2; two
 *                        rows are produced per loop iteration, reusing the
 *                        partial sums (l0,h0 / l1,h1) of the shared row
 * The trailing CALL_2X_PIXELS lines presumably derive the 16-pixel-wide
 * versions from the 8-wide ones (macro defined elsewhere -- confirm).
 *
 * NOTE(review): the first function is emitted as OPNAME##_pixels, whereas the
 * first CALL_2X_PIXELS line refers to OPNAME##_pixels_c; as written the names
 * do not match.
 *
 * op_avg(a, b) assigns to a the rounded-up per-byte average of a and b,
 * using the same 64-bit expression as _pixels_x2.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 735 #else // 64 bit variant
 736
 737 #define PIXOP2(OPNAME, OP) \
 738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 739     int i;\
 740     for(i=0; i<h; i++){\
 741         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 742         pixels+=line_size;\
 743         block +=line_size;\
 744     }\
 745 }\
 746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 747     int i;\
 748     for(i=0; i<h; i++){\
 749         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 750         pixels+=line_size;\
 751         block +=line_size;\
 752     }\
 753 }\
 754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 755     int i;\
 756     for(i=0; i<h; i++){\
 757         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 758         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 759         pixels+=line_size;\
 760         block +=line_size;\
 761     }\
 762 }\
 763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 764     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 765 }\
 766 \
 767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 768                                                 int src_stride1, int src_stride2, int h){\
 769     int i;\
 770     for(i=0; i<h; i++){\
 771         uint32_t a,b;\
 772         a= LD32(&src1[i*src_stride1  ]);\
 773         b= LD32(&src2[i*src_stride2  ]);\
 774         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 775         a= LD32(&src1[i*src_stride1+4]);\
 776         b= LD32(&src2[i*src_stride2+4]);\
 777         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 778     }\
 779 }\
 780 \
 781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 782                                                 int src_stride1, int src_stride2, int h){\
 783     int i;\
 784     for(i=0; i<h; i++){\
 785         uint32_t a,b;\
 786         a= LD32(&src1[i*src_stride1  ]);\
 787         b= LD32(&src2[i*src_stride2  ]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 789         a= LD32(&src1[i*src_stride1+4]);\
 790         b= LD32(&src2[i*src_stride2+4]);\
 791         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 792     }\
 793 }\
 794 \
 795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 796                                                 int src_stride1, int src_stride2, int h){\
 797     int i;\
 798     for(i=0; i<h; i++){\
 799         uint32_t a,b;\
 800         a= LD32(&src1[i*src_stride1  ]);\
 801         b= LD32(&src2[i*src_stride2  ]);\
 802         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 803     }\
 804 }\
 805 \
 806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 807                                                 int src_stride1, int src_stride2, int h){\
 808     int i;\
 809     for(i=0; i<h; i++){\
 810         uint32_t a,b;\
 811         a= LD16(&src1[i*src_stride1  ]);\
 812         b= LD16(&src2[i*src_stride2  ]);\
 813         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 814     }\
 815 }\
 816 \
 817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 818                                                 int src_stride1, int src_stride2, int h){\
 819     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 820     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 821 }\
 822 \
 823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 824                                                 int src_stride1, int src_stride2, int h){\
 825     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 826     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 827 }\
 828 \
 829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 834     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 835 }\
 836 \
 837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 838     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 839 }\
 840 \
 841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 842     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 843 }\
 844 \
 845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 846                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 847     int i;\
 848     for(i=0; i<h; i++){\
 849         uint32_t a, b, c, d, l0, l1, h0, h1;\
 850         a= LD32(&src1[i*src_stride1]);\
 851         b= LD32(&src2[i*src_stride2]);\
 852         c= LD32(&src3[i*src_stride3]);\
 853         d= LD32(&src4[i*src_stride4]);\
 854         l0=  (a&0x03030303UL)\
 855            + (b&0x03030303UL)\
 856            + 0x02020202UL;\
 857         h0= ((a&0xFCFCFCFCUL)>>2)\
 858           + ((b&0xFCFCFCFCUL)>>2);\
 859         l1=  (c&0x03030303UL)\
 860            + (d&0x03030303UL);\
 861         h1= ((c&0xFCFCFCFCUL)>>2)\
 862           + ((d&0xFCFCFCFCUL)>>2);\
 863         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 864         a= LD32(&src1[i*src_stride1+4]);\
 865         b= LD32(&src2[i*src_stride2+4]);\
 866         c= LD32(&src3[i*src_stride3+4]);\
 867         d= LD32(&src4[i*src_stride4+4]);\
 868         l0=  (a&0x03030303UL)\
 869            + (b&0x03030303UL)\
 870            + 0x02020202UL;\
 871         h0= ((a&0xFCFCFCFCUL)>>2)\
 872           + ((b&0xFCFCFCFCUL)>>2);\
 873         l1=  (c&0x03030303UL)\
 874            + (d&0x03030303UL);\
 875         h1= ((c&0xFCFCFCFCUL)>>2)\
 876           + ((d&0xFCFCFCFCUL)>>2);\
 877         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 878     }\
 879 }\
 880 \
 881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 882     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 883 }\
 884 \
 885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 886     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 887 }\
 888 \
 889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 890     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 891 }\
 892 \
 893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 894     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 895 }\
 896 \
 897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 898                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 899     int i;\
 900     for(i=0; i<h; i++){\
 901         uint32_t a, b, c, d, l0, l1, h0, h1;\
 902         a= LD32(&src1[i*src_stride1]);\
 903         b= LD32(&src2[i*src_stride2]);\
 904         c= LD32(&src3[i*src_stride3]);\
 905         d= LD32(&src4[i*src_stride4]);\
 906         l0=  (a&0x03030303UL)\
 907            + (b&0x03030303UL)\
 908            + 0x01010101UL;\
 909         h0= ((a&0xFCFCFCFCUL)>>2)\
 910           + ((b&0xFCFCFCFCUL)>>2);\
 911         l1=  (c&0x03030303UL)\
 912            + (d&0x03030303UL);\
 913         h1= ((c&0xFCFCFCFCUL)>>2)\
 914           + ((d&0xFCFCFCFCUL)>>2);\
 915         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 916         a= LD32(&src1[i*src_stride1+4]);\
 917         b= LD32(&src2[i*src_stride2+4]);\
 918         c= LD32(&src3[i*src_stride3+4]);\
 919         d= LD32(&src4[i*src_stride4+4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x01010101UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930     }\
 931 }\
 932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 933                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 934     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 935     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 936 }\
 937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 938                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 939     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 940     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 941 }\
 942 \
 943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 944 {\
 945         int i, a0, b0, a1, b1;\
 946         a0= pixels[0];\
 947         b0= pixels[1] + 2;\
 948         a0 += b0;\
 949         b0 += pixels[2];\
 950 \
 951         pixels+=line_size;\
 952         for(i=0; i<h; i+=2){\
 953             a1= pixels[0];\
 954             b1= pixels[1];\
 955             a1 += b1;\
 956             b1 += pixels[2];\
 957 \
 958             block[0]= (a1+a0)>>2; /* FIXME non put */\
 959             block[1]= (b1+b0)>>2;\
 960 \
 961             pixels+=line_size;\
 962             block +=line_size;\
 963 \
 964             a0= pixels[0];\
 965             b0= pixels[1] + 2;\
 966             a0 += b0;\
 967             b0 += pixels[2];\
 968 \
 969             block[0]= (a1+a0)>>2;\
 970             block[1]= (b1+b0)>>2;\
 971             pixels+=line_size;\
 972             block +=line_size;\
 973         }\
 974 }\
 975 \
 976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 977 {\
 978         int i;\
 979         const uint32_t a= LD32(pixels  );\
 980         const uint32_t b= LD32(pixels+1);\
 981         uint32_t l0=  (a&0x03030303UL)\
 982                     + (b&0x03030303UL)\
 983                     + 0x02020202UL;\
 984         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 985                    + ((b&0xFCFCFCFCUL)>>2);\
 986         uint32_t l1,h1;\
 987 \
 988         pixels+=line_size;\
 989         for(i=0; i<h; i+=2){\
 990             uint32_t a= LD32(pixels  );\
 991             uint32_t b= LD32(pixels+1);\
 992             l1=  (a&0x03030303UL)\
 993                + (b&0x03030303UL);\
 994             h1= ((a&0xFCFCFCFCUL)>>2)\
 995               + ((b&0xFCFCFCFCUL)>>2);\
 996             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 997             pixels+=line_size;\
 998             block +=line_size;\
 999             a= LD32(pixels  );\
1000             b= LD32(pixels+1);\
1001             l0=  (a&0x03030303UL)\
1002                + (b&0x03030303UL)\
1003                + 0x02020202UL;\
1004             h0= ((a&0xFCFCFCFCUL)>>2)\
1005               + ((b&0xFCFCFCFCUL)>>2);\
1006             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007             pixels+=line_size;\
1008             block +=line_size;\
1009         }\
1010 }\
1011 \
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1013 {\
1014     int j;\
1015     for(j=0; j<2; j++){\
1016         int i;\
1017         const uint32_t a= LD32(pixels  );\
1018         const uint32_t b= LD32(pixels+1);\
1019         uint32_t l0=  (a&0x03030303UL)\
1020                     + (b&0x03030303UL)\
1021                     + 0x02020202UL;\
1022         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023                    + ((b&0xFCFCFCFCUL)>>2);\
1024         uint32_t l1,h1;\
1025 \
1026         pixels+=line_size;\
1027         for(i=0; i<h; i+=2){\
1028             uint32_t a= LD32(pixels  );\
1029             uint32_t b= LD32(pixels+1);\
1030             l1=  (a&0x03030303UL)\
1031                + (b&0x03030303UL);\
1032             h1= ((a&0xFCFCFCFCUL)>>2)\
1033               + ((b&0xFCFCFCFCUL)>>2);\
1034             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035             pixels+=line_size;\
1036             block +=line_size;\
1037             a= LD32(pixels  );\
1038             b= LD32(pixels+1);\
1039             l0=  (a&0x03030303UL)\
1040                + (b&0x03030303UL)\
1041                + 0x02020202UL;\
1042             h0= ((a&0xFCFCFCFCUL)>>2)\
1043               + ((b&0xFCFCFCFCUL)>>2);\
1044             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045             pixels+=line_size;\
1046             block +=line_size;\
1047         }\
1048         pixels+=4-line_size*(h+1);\
1049         block +=4-line_size*h;\
1050     }\
1051 }\
1052 \
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1054 {\
1055     int j;\
1056     for(j=0; j<2; j++){\
1057         int i;\
1058         const uint32_t a= LD32(pixels  );\
1059         const uint32_t b= LD32(pixels+1);\
1060         uint32_t l0=  (a&0x03030303UL)\
1061                     + (b&0x03030303UL)\
1062                     + 0x01010101UL;\
1063         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064                    + ((b&0xFCFCFCFCUL)>>2);\
1065         uint32_t l1,h1;\
1066 \
1067         pixels+=line_size;\
1068         for(i=0; i<h; i+=2){\
1069             uint32_t a= LD32(pixels  );\
1070             uint32_t b= LD32(pixels+1);\
1071             l1=  (a&0x03030303UL)\
1072                + (b&0x03030303UL);\
1073             h1= ((a&0xFCFCFCFCUL)>>2)\
1074               + ((b&0xFCFCFCFCUL)>>2);\
1075             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076             pixels+=line_size;\
1077             block +=line_size;\
1078             a= LD32(pixels  );\
1079             b= LD32(pixels+1);\
1080             l0=  (a&0x03030303UL)\
1081                + (b&0x03030303UL)\
1082                + 0x01010101UL;\
1083             h0= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086             pixels+=line_size;\
1087             block +=line_size;\
1088         }\
1089         pixels+=4-line_size*(h+1);\
1090         block +=4-line_size*h;\
1091     }\
1092 }\
1093 \
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1102
1103 #define op_avg(a, b) a = rnd_avg32(a, b)
1104 #endif
1105 #define op_put(a, b) a = b
1106
1107 PIXOP2(avg, op_avg)
1108 PIXOP2(put, op_put)
1109 #undef op_avg
1110 #undef op_put
1111
/*
 * Rounded integer means of two / four values: the +1 (resp. +2) bias makes
 * the right shift round halves upward instead of truncating.
 * Every argument is parenthesized so that expressions with lower-precedence
 * operators (e.g. `p | q`, `v >> 1`) are averaged as a whole.
 * Note: arguments are still evaluated exactly once each, but must be free of
 * side effects that depend on evaluation order.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1114
/**
 * Single-stride entry point for the 16-pixel-wide "no rounding" two-source
 * put: forwards to put_no_rnd_pixels16_l2() with the same stride used for
 * the destination and for both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1118
/**
 * Single-stride entry point for the 8-pixel-wide "no rounding" two-source
 * put: forwards to put_no_rnd_pixels8_l2() with the same stride used for
 * the destination and for both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1122
/**
 * Bilinear interpolation of an 8-pixel-wide block at a 1/16-pel offset.
 *
 * Each output pixel is a weighted sum of the 2x2 source neighbourhood
 * (current pixel, right, below, below-right); the four weights add up to
 * 256, hence the final shift by 8.
 *
 * @param dst     destination block (8 pixels written per row)
 * @param src     source block; 9 pixels per row and h+1 rows are read
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction in 1/16 units
 * @param y16     vertical fraction in 1/16 units
 * @param rounder value added before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_cur        = (16 - x16) * (16 - y16);
    const int w_right      = (     x16) * (16 - y16);
    const int w_below      = (16 - x16) * (     y16);
    const int w_belowright = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur        * src[col]
                      + w_right      * src[col + 1]
                      + w_below      * src[stride + col]
                      + w_belowright * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1145
/**
 * Affine motion compensation of an 8-pixel-wide block with bilinear
 * interpolation and edge clamping.
 *
 * The source position of every output pixel is tracked in 16.16 fixed point.
 * It starts at (ox, oy), advances by (dxx, dyx) per output column and by
 * (dxy, dyy) per output row. After dropping the low 16 bits, the lowest
 * `shift` bits are the sub-pixel fraction and the rest is the integer
 * source coordinate.
 *
 * When a coordinate falls outside [0, width-1) / [0, height-1) it is clamped
 * with clip() and interpolation along that axis is skipped.
 *
 * @param dst     destination block (8 pixels per row, h rows)
 * @param src     source picture
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param r       value added before the final shift by 2*shift
 * @param width   source width used for the horizontal bound
 * @param height  source height used for the vertical bound
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one       = 1 << shift;
    const int frac_mask = one - 1;
    const int x_limit   = width  - 1;
    const int y_limit   = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        int pos_x = ox;
        int pos_y = oy;
        int col;

        for (col = 0; col < 8; col++, pos_x += dxx, pos_y += dyx) { //XXX FIXME optimize
            const int fx = (pos_x >> 16) & frac_mask;
            const int fy = (pos_y >> 16) & frac_mask;
            const int sx = (pos_x >> 16) >> shift;
            const int sy = (pos_y >> 16) >> shift;
            /* unsigned compare rejects negative coordinates and those at or past the limit in one test */
            const int x_inside = (unsigned)sx < (unsigned)x_limit;
            const int y_inside = (unsigned)sy < (unsigned)y_limit;
            uint8_t *out = &dst[row * stride + col];
            int index;

            if (x_inside && y_inside) {
                /* full bilinear blend of the 2x2 neighbourhood */
                index = sx + sy * stride;
                *out = (  (  src[index             ] * (one - fx)
                           + src[index          + 1] *        fx ) * (one - fy)
                        + (  src[index + stride    ] * (one - fx)
                           + src[index + stride + 1] *        fx ) *        fy
                        + r) >> (shift * 2);
            } else if (x_inside) {
                /* row clamped: horizontal blend only */
                index = sx + clip(sy, 0, y_limit) * stride;
                *out = ( (  src[index    ] * (one - fx)
                          + src[index + 1] *        fx ) * one
                        + r) >> (shift * 2);
            } else if (y_inside) {
                /* column clamped: vertical blend only */
                index = clip(sx, 0, x_limit) + sy * stride;
                *out = ( (  src[index         ] * (one - fy)
                          + src[index + stride] *        fy ) * one
                        + r) >> (shift * 2);
            } else {
                /* both clamped: plain copy of the nearest valid pixel */
                index = clip(sx, 0, x_limit) + clip(sy, 0, y_limit) * stride;
                *out = src[index];
            }
        }
    }
}
1203
/**
 * Full-pel "put" for the tpel family: picks the plain put_pixelsN_c copy
 * routine matching the requested block width. Widths other than 2, 4, 8
 * and 16 are silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1212
/**
 * Horizontal interpolation, one third of the way towards the right
 * neighbour: (2*cur + right + 1) / 3, with the division done as a
 * multiplication by 683 and a shift by 11 (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1223
/**
 * Horizontal interpolation, two thirds of the way towards the right
 * neighbour: (cur + 2*right + 1) / 3, with the division done as a
 * multiplication by 683 and a shift by 11 (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1234
/**
 * Vertical interpolation, one third of the way towards the pixel below:
 * (2*cur + below + 1) / 3, with the division done as a multiplication by
 * 683 and a shift by 11 (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + stride] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1245
/**
 * Diagonal interpolation over the 2x2 neighbourhood with weights
 * 4 (cur), 3 (right), 3 (below), 2 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiplication by 2731 and a shift by 15
 * (2731/32768 ~= 1/12), with +6 as the rounding bias.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1256
/**
 * Diagonal interpolation over the 2x2 neighbourhood with weights
 * 3 (cur), 2 (right), 4 (below), 3 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiplication by 2731 and a shift by 15
 * (2731/32768 ~= 1/12), with +6 as the rounding bias.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 2 * src[col + 1]
                          + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1267
/**
 * Vertical interpolation, two thirds of the way towards the pixel below:
 * (cur + 2*below + 1) / 3, with the division done as a multiplication by
 * 683 and a shift by 11 (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + stride] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1278
/**
 * Diagonal interpolation over the 2x2 neighbourhood with weights
 * 3 (cur), 4 (right), 2 (below), 3 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiplication by 2731 and a shift by 15
 * (2731/32768 ~= 1/12), with +6 as the rounding bias.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 4 * src[col + 1]
                          + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1289
/**
 * Diagonal interpolation over the 2x2 neighbourhood with weights
 * 2 (cur), 3 (right), 3 (below), 4 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiplication by 2731 and a shift by 15
 * (2731/32768 ~= 1/12), with +6 as the rounding bias.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1300
/**
 * Full-pel "avg" for the tpel family: picks the avg_pixelsN_c routine
 * matching the requested block width. Widths other than 2, 4, 8 and 16
 * are silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1309
/**
 * Averaging variant of the horizontal one-third interpolation: computes
 * (2*cur + right + 1) / 3 (as *683 >> 11) and then stores the rounded-up
 * mean of that value and the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1320
/**
 * Averaging variant of the horizontal two-thirds interpolation: computes
 * (cur + 2*right + 1) / 3 (as *683 >> 11) and then stores the rounded-up
 * mean of that value and the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1331
/**
 * Averaging variant of the vertical one-third interpolation: computes
 * (2*cur + below + 1) / 3 (as *683 >> 11) and then stores the rounded-up
 * mean of that value and the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1342
/**
 * Averaging variant of the 4/3/3/2-weighted diagonal interpolation:
 * the weighted 2x2 sum (+6 bias) is divided by 12 via *2731 >> 15, and the
 * rounded-up mean of that value and the pixel already in dst is stored.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1353
/**
 * Averaging variant of the 3/2/4/3-weighted diagonal interpolation:
 * the weighted 2x2 sum (+6 bias) is divided by 12 via *2731 >> 15, and the
 * rounded-up mean of that value and the pixel already in dst is stored.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 2 * src[col + 1]
                          + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1364
/**
 * Averaging variant of the vertical two-thirds interpolation: computes
 * (cur + 2*below + 1) / 3 (as *683 >> 11) and then stores the rounded-up
 * mean of that value and the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1375
/**
 * Averaging variant of the 3/4/2/3-weighted diagonal interpolation:
 * the weighted 2x2 sum (+6 bias) is divided by 12 via *2731 >> 15, and the
 * rounded-up mean of that value and the pixel already in dst is stored.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 4 * src[col + 1]
                          + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1386
/**
 * Averaging variant of the 2/3/3/4-weighted diagonal interpolation:
 * the weighted 2x2 sum (+6 bias) is divided by 12 via *2731 >> 15, and the
 * rounded-up mean of that value and the pixel already in dst is stored.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/*
 * Disabled template: TPEL_WIDTH(width) would generate fixed-width wrappers
 * (put_tpel_pixels<width>_mcXY_c) around the generic put_tpel_pixels_mcXY_c
 * routines, passing `width` through as a constant.
 * The wrapper bodies are plain calls; a stray `void` in front of each call
 * previously turned them into malformed declarations, so the template could
 * not have compiled if it were enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1418
1419 #define H264_CHROMA_MC(OPNAME, OP)\
1420 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1421     const int A=(8-x)*(8-y);\
1422     const int B=(  x)*(8-y);\
1423     const int C=(8-x)*(  y);\
1424     const int D=(  x)*(  y);\
1425     int i;\
1426     \
1427     assert(x<8 && y<8 && x>=0 && y>=0);\
1428 \
1429     for(i=0; i<h; i++)\
1430     {\
1431         OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1432         OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1433         dst+= stride;\
1434         src+= stride;\
1435     }\
1436 }\
1437 \
1438 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1439     const int A=(8-x)*(8-y);\
1440     const int B=(  x)*(8-y);\
1441     const int C=(8-x)*(  y);\
1442     const int D=(  x)*(  y);\
1443     int i;\
1444     \
1445     assert(x<8 && y<8 && x>=0 && y>=0);\
1446 \
1447     for(i=0; i<h; i++)\
1448     {\
1449         OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1450         OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1451         OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1452         OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1453         dst+= stride;\
1454         src+= stride;\
1455     }\
1456 }\
1457 \
1458 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1459     const int A=(8-x)*(8-y);\
1460     const int B=(  x)*(8-y);\
1461     const int C=(8-x)*(  y);\
1462     const int D=(  x)*(  y);\
1463     int i;\
1464     \
1465     assert(x<8 && y<8 && x>=0 && y>=0);\
1466 \
1467     for(i=0; i<h; i++)\
1468     {\
1469         OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1470         OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1471         OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1472         OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1473         OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1474         OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1475         OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1476         OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1477         dst+= stride;\
1478         src+= stride;\
1479     }\
1480 }
1481
1482 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1483 #define op_put(a, b) a = (((b) + 32)>>6)
1484
1485 H264_CHROMA_MC(put_       , op_put)
1486 H264_CHROMA_MC(avg_       , op_avg)
1487 #undef op_avg
1488 #undef op_put
1489
/**
 * Copies a block 2 bytes wide and h rows tall, one 16-bit LD16/ST16
 * transfer per row. Source and destination may use different strides.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST16(dst, LD16(src));
    }
}
1500
/**
 * Copies a block 4 bytes wide and h rows tall, one 32-bit LD32/ST32
 * transfer per row. Source and destination may use different strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1511
/**
 * Copies a block 8 bytes wide and h rows tall as two 32-bit LD32/ST32
 * transfers per row. Source and destination may use different strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1523
/**
 * Copies a block 16 bytes wide and h rows tall as four 32-bit LD32/ST32
 * transfers per row. Source and destination may use different strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1537
/**
 * Copies a block 17 bytes wide and h rows tall: four 32-bit LD32/ST32
 * transfers followed by a single byte copy for column 16 on every row.
 * Source and destination may use different strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        /* the odd trailing column does not fit a word transfer */
        dst[16] = src[16];
    }
}
1552
/**
 * Copies a block 9 bytes wide and h rows tall: two 32-bit LD32/ST32
 * transfers followed by a single byte copy for column 8 on every row.
 * Source and destination may use different strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        /* the odd trailing column does not fit a word transfer */
        dst[8] = src[8];
    }
}
1565
1566
/*
 * QPEL_MC(r, OPNAME, RND, OP)
 *
 * Expands to one complete family of C functions for the "mpeg4_qpel" filter:
 * four lowpass helpers plus the qpel8_mcXY / qpel16_mcXY entry points built
 * on top of them.
 *
 * Macro arguments
 *   r      - not referenced anywhere in the expansion.
 *   OPNAME - prefix pasted onto every generated function name and onto the
 *            pixels{8,16}_c, pixels{8,16}_l2 and pixels{8,16}_l4 helpers that
 *            write the final result to dst.
 *   RND    - token pasted between "put" and the helper name for intermediate
 *            passes that write into local scratch buffers (halfH, halfV,
 *            halfHV, half).
 *   OP     - OP(lvalue, sum): stores one filtered sample.  Every lowpass
 *            helper declares a local "cm" (cropTbl + MAX_NEG_CROP) that the
 *            expansion itself never reads; the op_* macros defined right
 *            after QPEL_MC index it, so it must stay in scope for OP.
 *
 * Lowpass helpers
 *   Each output sample is
 *       OP(out, (a+b)*20 - (c+d)*6 + (e+f)*3 - (g+h))
 *   i.e. a symmetric 8-tap kernel with weights -1 3 -6 20 20 -6 3 -1.  No
 *   scaling is done here; that is left to OP.  A row (or column) of N
 *   outputs reads exactly N+1 inputs, src[0]..src[N]; for the three outputs
 *   at each end, taps that would fall outside that range are taken from
 *   samples inside it instead (see the repeated src[0] / src[N] indices).
 *   The _h_ variants take the row count h as an argument; the _v_ variants
 *   always process 8 (or 16) columns of 8 (or 16) output rows.
 *
 * Entry points: OPNAME qpelN_mcXY_c(dst, src, stride)
 *   NOTE(review): X and Y look like the horizontal and vertical
 *   quarter-sample offsets (mc20 is the pure horizontal filter, mc02 the
 *   pure vertical one, mc00 a plain pixelsN_c call) - confirm against the
 *   function tables that reference these names.
 *   pixelsN_l2 and pixelsN_l4 are defined outside this section; presumably
 *   they average two and four sources respectively.
 *   The non-static ff_..._old_c variants build halfH, halfV and halfHV
 *   separately and combine them in one final pixelsN_l4 or pixelsN_l2 call;
 *   the static versions of the same positions first fold the unfiltered
 *   block into halfH and then need at most one pixelsN_l2 call or a direct
 *   vertical filter into dst.
 *
 * Scratch buffers
 *   full   - 16*9 (8x8 case) or 24*17 (16x16 case) bytes, filled by
 *            copy_block9 / copy_block17 with row stride 16 / 24.
 *   halfH  - 72 = 9 rows of 8, or 272 = 17 rows of 16: horizontal filter
 *            output including the extra row the vertical pass reads.
 *   halfV, halfHV, half - 64 or 256 bytes, one NxN block.
 */
1567 #define QPEL_MC(r, OPNAME, RND, OP) \
1568 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1569     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1570     int i;\
1571     for(i=0; i<h; i++)\
1572     {\
1573         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1574         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1575         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1576         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1577         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1578         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1579         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1580         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1581         dst+=dstStride;\
1582         src+=srcStride;\
1583     }\
1584 }\
1585 /* Vertical counterpart: same taps down each of 8 columns; reads 9 rows, writes 8. */\
1586 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1587     const int w=8;\
1588     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1589     int i;\
1590     for(i=0; i<w; i++)\
1591     {\
1592         const int src0= src[0*srcStride];\
1593         const int src1= src[1*srcStride];\
1594         const int src2= src[2*srcStride];\
1595         const int src3= src[3*srcStride];\
1596         const int src4= src[4*srcStride];\
1597         const int src5= src[5*srcStride];\
1598         const int src6= src[6*srcStride];\
1599         const int src7= src[7*srcStride];\
1600         const int src8= src[8*srcStride];\
1601         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1602         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1603         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1604         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1605         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1606         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1607         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1608         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1609         dst++;\
1610         src++;\
1611     }\
1612 }\
1613 /* 16-sample-wide horizontal filter: reads src[0..16] per row, writes dst[0..15]. */\
1614 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1615     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1616     int i;\
1617     \
1618     for(i=0; i<h; i++)\
1619     {\
1620         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1621         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1622         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1623         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1624         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1625         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1626         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1627         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1628         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1629         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1630         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1631         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1632         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1633         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1634         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1635         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1636         dst+=dstStride;\
1637         src+=srcStride;\
1638     }\
1639 }\
1640 /* Vertical 16-column filter: reads 17 rows, writes 16. */\
1641 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1642     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1643     int i;\
1644     const int w=16;\
1645     for(i=0; i<w; i++)\
1646     {\
1647         const int src0= src[0*srcStride];\
1648         const int src1= src[1*srcStride];\
1649         const int src2= src[2*srcStride];\
1650         const int src3= src[3*srcStride];\
1651         const int src4= src[4*srcStride];\
1652         const int src5= src[5*srcStride];\
1653         const int src6= src[6*srcStride];\
1654         const int src7= src[7*srcStride];\
1655         const int src8= src[8*srcStride];\
1656         const int src9= src[9*srcStride];\
1657         const int src10= src[10*srcStride];\
1658         const int src11= src[11*srcStride];\
1659         const int src12= src[12*srcStride];\
1660         const int src13= src[13*srcStride];\
1661         const int src14= src[14*srcStride];\
1662         const int src15= src[15*srcStride];\
1663         const int src16= src[16*srcStride];\
1664         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1665         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1666         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1667         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1668         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1669         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1670         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1671         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1672         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1673         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1674         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1675         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1676         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1677         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1678         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1679         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1680         dst++;\
1681         src++;\
1682     }\
1683 }\
1684 /* 8x8 entry points start here. mc00: no filtering, a single pixels8_c call. */\
1685 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1686     OPNAME ## pixels8_c(dst, src, stride, 8);\
1687 }\
1688 /* mc10: combine src with its horizontal lowpass through pixels8_l2. */\
1689 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1690     uint8_t half[64];\
1691     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1692     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1693 }\
1694 /* mc20: horizontal lowpass written straight to dst. */\
1695 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1696     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1697 }\
1698 /* mc30: like mc10, but the unfiltered input is src+1. */\
1699 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1700     uint8_t half[64];\
1701     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1702     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1703 }\
1704 /* mc01: combine the copied block with its vertical lowpass. */\
1705 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1706     uint8_t full[16*9]; /* 9 rows at stride 16, filled by copy_block9 */\
1707     uint8_t half[64];\
1708     copy_block9(full, src, 16, stride, 9);\
1709     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1710     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1711 }\
1712 /* mc02: vertical lowpass of the copied block written straight to dst. */\
1713 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1714     uint8_t full[16*9];\
1715     copy_block9(full, src, 16, stride, 9);\
1716     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1717 }\
1718 /* mc03: like mc01, but the unfiltered input starts one row down (full+16). */\
1719 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1720     uint8_t full[16*9];\
1721     uint8_t half[64];\
1722     copy_block9(full, src, 16, stride, 9);\
1723     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1724     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1725 }\
1726 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727     uint8_t full[16*9];\
1728     uint8_t halfH[72]; /* 9 rows of 8: h_lowpass is run with h=9 */\
1729     uint8_t halfV[64];\
1730     uint8_t halfHV[64];\
1731     copy_block9(full, src, 16, stride, 9);\
1732     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736 }\
1737 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1738     uint8_t full[16*9];\
1739     uint8_t halfH[72];\
1740     uint8_t halfHV[64];\
1741     copy_block9(full, src, 16, stride, 9);\
1742     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* fold the unfiltered block into halfH, in place */\
1744     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1746 }\
1747 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748     uint8_t full[16*9];\
1749     uint8_t halfH[72];\
1750     uint8_t halfV[64];\
1751     uint8_t halfHV[64];\
1752     copy_block9(full, src, 16, stride, 9);\
1753     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1754     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757 }\
1758 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1759     uint8_t full[16*9];\
1760     uint8_t halfH[72];\
1761     uint8_t halfHV[64];\
1762     copy_block9(full, src, 16, stride, 9);\
1763     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1767 }\
1768 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769     uint8_t full[16*9];\
1770     uint8_t halfH[72];\
1771     uint8_t halfV[64];\
1772     uint8_t halfHV[64];\
1773     copy_block9(full, src, 16, stride, 9);\
1774     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1775     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1778 }\
1779 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[16*9];\
1781     uint8_t halfH[72];\
1782     uint8_t halfHV[64];\
1783     copy_block9(full, src, 16, stride, 9);\
1784     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1785     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1786     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1788 }\
1789 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1790     uint8_t full[16*9];\
1791     uint8_t halfH[72];\
1792     uint8_t halfV[64];\
1793     uint8_t halfHV[64];\
1794     copy_block9(full, src, 16, stride, 9);\
1795     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1796     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1797     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1798     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1799 }\
1800 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1801     uint8_t full[16*9];\
1802     uint8_t halfH[72];\
1803     uint8_t halfHV[64];\
1804     copy_block9(full, src, 16, stride, 9);\
1805     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1806     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1807     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1808     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1809 }\
1810 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1811     uint8_t halfH[72];\
1812     uint8_t halfHV[64];\
1813     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1816 }\
1817 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1818     uint8_t halfH[72];\
1819     uint8_t halfHV[64];\
1820     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1821     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1823 }\
1824 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1825     uint8_t full[16*9];\
1826     uint8_t halfH[72];\
1827     uint8_t halfV[64];\
1828     uint8_t halfHV[64];\
1829     copy_block9(full, src, 16, stride, 9);\
1830     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1831     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1832     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1833     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1834 }\
1835 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1836     uint8_t full[16*9];\
1837     uint8_t halfH[72];\
1838     copy_block9(full, src, 16, stride, 9);\
1839     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1840     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1841     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1842 }\
1843 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844     uint8_t full[16*9];\
1845     uint8_t halfH[72];\
1846     uint8_t halfV[64];\
1847     uint8_t halfHV[64];\
1848     copy_block9(full, src, 16, stride, 9);\
1849     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1851     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1853 }\
1854 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1855     uint8_t full[16*9];\
1856     uint8_t halfH[72];\
1857     copy_block9(full, src, 16, stride, 9);\
1858     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1859     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1860     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1861 }\
1862 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1863     uint8_t halfH[72];\
1864     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1865     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1866 }\
1867 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* 16x16 entry points start here */\
1868     OPNAME ## pixels16_c(dst, src, stride, 16);\
1869 }\
1870 /* mc10: combine src with its horizontal lowpass through pixels16_l2. */\
1871 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1872     uint8_t half[256];\
1873     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1874     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1875 }\
1876 /* mc20: horizontal lowpass written straight to dst. */\
1877 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1878     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1879 }\
1880 /* mc30: like mc10, but the unfiltered input is src+1. */\
1881 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1882     uint8_t half[256];\
1883     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1884     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1885 }\
1886 /* mc01: combine the copied block with its vertical lowpass. */\
1887 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1888     uint8_t full[24*17]; /* 17 rows at stride 24, filled by copy_block17 */\
1889     uint8_t half[256];\
1890     copy_block17(full, src, 24, stride, 17);\
1891     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1892     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1893 }\
1894 /* mc02: vertical lowpass of the copied block written straight to dst. */\
1895 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1896     uint8_t full[24*17];\
1897     copy_block17(full, src, 24, stride, 17);\
1898     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1899 }\
1900 /* mc03: like mc01, but the unfiltered input starts one row down (full+24). */\
1901 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1902     uint8_t full[24*17];\
1903     uint8_t half[256];\
1904     copy_block17(full, src, 24, stride, 17);\
1905     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1906     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1907 }\
1908 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909     uint8_t full[24*17];\
1910     uint8_t halfH[272]; /* 17 rows of 16: h_lowpass is run with h=17 */\
1911     uint8_t halfV[256];\
1912     uint8_t halfHV[256];\
1913     copy_block17(full, src, 24, stride, 17);\
1914     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918 }\
1919 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1920     uint8_t full[24*17];\
1921     uint8_t halfH[272];\
1922     uint8_t halfHV[256];\
1923     copy_block17(full, src, 24, stride, 17);\
1924     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* fold the unfiltered block into halfH, in place */\
1926     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1928 }\
1929 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930     uint8_t full[24*17];\
1931     uint8_t halfH[272];\
1932     uint8_t halfV[256];\
1933     uint8_t halfHV[256];\
1934     copy_block17(full, src, 24, stride, 17);\
1935     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1936     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939 }\
1940 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1941     uint8_t full[24*17];\
1942     uint8_t halfH[272];\
1943     uint8_t halfHV[256];\
1944     copy_block17(full, src, 24, stride, 17);\
1945     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1949 }\
1950 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1951     uint8_t full[24*17];\
1952     uint8_t halfH[272];\
1953     uint8_t halfV[256];\
1954     uint8_t halfHV[256];\
1955     copy_block17(full, src, 24, stride, 17);\
1956     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1957     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1958     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1960 }\
1961 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1962     uint8_t full[24*17];\
1963     uint8_t halfH[272];\
1964     uint8_t halfHV[256];\
1965     copy_block17(full, src, 24, stride, 17);\
1966     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1967     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1968     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1970 }\
1971 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1972     uint8_t full[24*17];\
1973     uint8_t halfH[272];\
1974     uint8_t halfV[256];\
1975     uint8_t halfHV[256];\
1976     copy_block17(full, src, 24, stride, 17);\
1977     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1978     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1979     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1980     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1981 }\
1982 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1983     uint8_t full[24*17];\
1984     uint8_t halfH[272];\
1985     uint8_t halfHV[256];\
1986     copy_block17(full, src, 24, stride, 17);\
1987     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1988     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1989     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1990     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1991 }\
1992 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1993     uint8_t halfH[272];\
1994     uint8_t halfHV[256];\
1995     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1997     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1998 }\
1999 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2000     uint8_t halfH[272];\
2001     uint8_t halfHV[256];\
2002     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2003     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2005 }\
2006 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007     uint8_t full[24*17];\
2008     uint8_t halfH[272];\
2009     uint8_t halfV[256];\
2010     uint8_t halfHV[256];\
2011     copy_block17(full, src, 24, stride, 17);\
2012     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2013     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2014     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2016 }\
2017 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t full[24*17];\
2019     uint8_t halfH[272];\
2020     copy_block17(full, src, 24, stride, 17);\
2021     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2022     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2023     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2024 }\
2025 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t full[24*17];\
2027     uint8_t halfH[272];\
2028     uint8_t halfV[256];\
2029     uint8_t halfHV[256];\
2030     copy_block17(full, src, 24, stride, 17);\
2031     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2032     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2033     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2034     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2035 }\
2036 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2037     uint8_t full[24*17];\
2038     uint8_t halfH[272];\
2039     copy_block17(full, src, 24, stride, 17);\
2040     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2041     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2042     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2043 }\
2044 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2045     uint8_t halfH[272];\
2046     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2047     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2048 }
2049
/*
 * Store operations handed to QPEL_MC as its OP argument.  "a" is the
 * destination lvalue, "b" the unscaled filter sum.  All four read a variable
 * named cm that must be in scope where the macro is expanded.
 *   (b + 16) >> 5  scales the sum by 1/32 with rounding; the _no_rnd forms
 *                  add 15 instead of 16.
 *   op_put*        overwrite a with cm[scaled sum].
 *   op_avg*        replace a with the mean of its old value and
 *                  cm[scaled sum]; op_avg adds 1 before halving,
 *                  op_avg_no_rnd does not.
 * NOTE(review): cm[] presumably clamps the scaled sum to 0..255 - the table
 * it points into is filled outside this section; confirm.
 */
2050 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2051 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2052 #define op_put(a, b) a = cm[((b) + 16)>>5]
2053 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2054
/*
 * Instantiate the function families.  The second and third arguments are
 * token-pasted, so the embedded underscores are part of the generated names
 * (put_qpel8_mc00_c, put_no_rnd_qpel8_mc00_c, avg_qpel8_mc00_c, ...).
 * With RND "_" the intermediate passes call put_mpeg4_qpel*_lowpass and
 * put_pixels*_l2; the avg_ family therefore relies on the lowpass helpers
 * defined by the put_ instantiation above it.
 * op_avg_no_rnd is not passed to any active QPEL_MC invocation here.
 */
2055 QPEL_MC(0, put_       , _       , op_put)
2056 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2057 QPEL_MC(0, avg_       , _       , op_avg)
2058 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The store macros are local to this section; drop them after instantiation. */
2059 #undef op_avg
2060 #undef op_avg_no_rnd
2061 #undef op_put
2062 #undef op_put_no_rnd
2063
2064 #if 1
2065 #define H264_LOWPASS(OPNAME, OP, OP2) \
2066 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2067     const int h=2;\
2068     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2069     int i;\
2070     for(i=0; i<h; i++)\
2071     {\
2072         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2073         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2074         dst+=dstStride;\
2075         src+=srcStride;\
2076     }\
2077 }\
2078 \
2079 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2080     const int w=2;\
2081     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2082     int i;\
2083     for(i=0; i<w; i++)\
2084     {\
2085         const int srcB= src[-2*srcStride];\
2086         const int srcA= src[-1*srcStride];\
2087         const int src0= src[0 *srcStride];\
2088         const int src1= src[1 *srcStride];\
2089         const int src2= src[2 *srcStride];\
2090         const int src3= src[3 *srcStride];\
2091         const int src4= src[4 *srcStride];\
2092         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2093         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2094         dst++;\
2095         src++;\
2096     }\
2097 }\
2098 \
2099 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2100     const int h=2;\
2101     const int w=2;\
2102     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2103     int i;\
2104     src -= 2*srcStride;\
2105     for(i=0; i<h+5; i++)\
2106     {\
2107         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2108         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2109         tmp+=tmpStride;\
2110         src+=srcStride;\
2111     }\
2112     tmp -= tmpStride*(h+5-2);\
2113     for(i=0; i<w; i++)\
2114     {\
2115         const int tmpB= tmp[-2*tmpStride];\
2116         const int tmpA= tmp[-1*tmpStride];\
2117         const int tmp0= tmp[0 *tmpStride];\
2118         const int tmp1= tmp[1 *tmpStride];\
2119         const int tmp2= tmp[2 *tmpStride];\
2120         const int tmp3= tmp[3 *tmpStride];\
2121         const int tmp4= tmp[4 *tmpStride];\
2122         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2123         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2124         dst++;\
2125         tmp++;\
2126     }\
2127 }\
2128 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2129     const int h=4;\
2130     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2131     int i;\
2132     for(i=0; i<h; i++)\
2133     {\
2134         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2135         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2136         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2137         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2138         dst+=dstStride;\
2139         src+=srcStride;\
2140     }\
2141 }\
2142 \
2143 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2144     const int w=4;\
2145     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2146     int i;\
2147     for(i=0; i<w; i++)\
2148     {\
2149         const int srcB= src[-2*srcStride];\
2150         const int srcA= src[-1*srcStride];\
2151         const int src0= src[0 *srcStride];\
2152         const int src1= src[1 *srcStride];\
2153         const int src2= src[2 *srcStride];\
2154         const int src3= src[3 *srcStride];\
2155         const int src4= src[4 *srcStride];\
2156         const int src5= src[5 *srcStride];\
2157         const int src6= src[6 *srcStride];\
2158         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2159         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2160         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2161         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2162         dst++;\
2163         src++;\
2164     }\
2165 }\
2166 \
2167 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2168     const int h=4;\
2169     const int w=4;\
2170     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2171     int i;\
2172     src -= 2*srcStride;\
2173     for(i=0; i<h+5; i++)\
2174     {\
2175         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2176         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2177         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2178         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2179         tmp+=tmpStride;\
2180         src+=srcStride;\
2181     }\
2182     tmp -= tmpStride*(h+5-2);\
2183     for(i=0; i<w; i++)\
2184     {\
2185         const int tmpB= tmp[-2*tmpStride];\
2186         const int tmpA= tmp[-1*tmpStride];\
2187         const int tmp0= tmp[0 *tmpStride];\
2188         const int tmp1= tmp[1 *tmpStride];\
2189         const int tmp2= tmp[2 *tmpStride];\
2190         const int tmp3= tmp[3 *tmpStride];\
2191         const int tmp4= tmp[4 *tmpStride];\
2192         const int tmp5= tmp[5 *tmpStride];\
2193         const int tmp6= tmp[6 *tmpStride];\
2194         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2195         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2196         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2197         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2198         dst++;\
2199         tmp++;\
2200     }\
2201 }\
2202 \
2203 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204     const int h=8;\
2205     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2206     int i;\
2207     for(i=0; i<h; i++)\
2208     {\
2209         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2210         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2211         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2212         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2213         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2214         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2215         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2216         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2217         dst+=dstStride;\
2218         src+=srcStride;\
2219     }\
2220 }\
2221 \
2222 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2223     const int w=8;\
2224     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2225     int i;\
2226     for(i=0; i<w; i++)\
2227     {\
2228         const int srcB= src[-2*srcStride];\
2229         const int srcA= src[-1*srcStride];\
2230         const int src0= src[0 *srcStride];\
2231         const int src1= src[1 *srcStride];\
2232         const int src2= src[2 *srcStride];\
2233         const int src3= src[3 *srcStride];\
2234         const int src4= src[4 *srcStride];\
2235         const int src5= src[5 *srcStride];\
2236         const int src6= src[6 *srcStride];\
2237         const int src7= src[7 *srcStride];\
2238         const int src8= src[8 *srcStride];\
2239         const int src9= src[9 *srcStride];\
2240         const int src10=src[10*srcStride];\
2241         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2242         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2243         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2244         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2245         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2246         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2247         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2248         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2249         dst++;\
2250         src++;\
2251     }\
2252 }\
2253 \
2254 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2255     const int h=8;\
2256     const int w=8;\
2257     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2258     int i;\
2259     src -= 2*srcStride;\
2260     for(i=0; i<h+5; i++)\
2261     {\
2262         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2263         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2264         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2265         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2266         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2267         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2268         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2269         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2270         tmp+=tmpStride;\
2271         src+=srcStride;\
2272     }\
2273     tmp -= tmpStride*(h+5-2);\
2274     for(i=0; i<w; i++)\
2275     {\
2276         const int tmpB= tmp[-2*tmpStride];\
2277         const int tmpA= tmp[-1*tmpStride];\
2278         const int tmp0= tmp[0 *tmpStride];\
2279         const int tmp1= tmp[1 *tmpStride];\
2280         const int tmp2= tmp[2 *tmpStride];\
2281         const int tmp3= tmp[3 *tmpStride];\
2282         const int tmp4= tmp[4 *tmpStride];\
2283         const int tmp5= tmp[5 *tmpStride];\
2284         const int tmp6= tmp[6 *tmpStride];\
2285         const int tmp7= tmp[7 *tmpStride];\
2286         const int tmp8= tmp[8 *tmpStride];\
2287         const int tmp9= tmp[9 *tmpStride];\
2288         const int tmp10=tmp[10*tmpStride];\
2289         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2290         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2291         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2292         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2293         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2294         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2295         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2296         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2297         dst++;\
2298         tmp++;\
2299     }\
2300 }\
2301 \
2302 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2303     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2304     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2305     src += 8*srcStride;\
2306     dst += 8*dstStride;\
2307     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2308     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2309 }\
2310 \
2311 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2312     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2313     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2314     src += 8*srcStride;\
2315     dst += 8*dstStride;\
2316     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2317     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2318 }\
2319 \
2320 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2321     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2322     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2323     src += 8*srcStride;\
2324     dst += 8*dstStride;\
2325     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2326     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2327 }\
2328
/**
 * Generate the sixteen h264_qpel<SIZE>_mcXY_c entry points for one store
 * flavour. OPNAME is the function-name prefix (the file instantiates put_
 * and avg_), SIZE the block edge length.
 *
 * All variants are assembled from the same parts:
 *  - the h / v / hv lowpass filters generated by H264_LOWPASS,
 *  - copy_block<SIZE>, used here to gather SIZE+5 rows starting two rows
 *    above src into a SIZE-wide scratch buffer for the vertical filter,
 *  - OPNAME pixels<SIZE>_l2, which receives two source planes
 *    (presumably combining them by averaging -- confirm at its definition).
 * Intermediate planes are always produced with the put_ filters; only the
 * final write to dst goes through OPNAME.
 * NOTE(review): X and Y in the mcXY suffix look like the horizontal and
 * vertical quarter-sample offsets -- confirm against the function tables.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    /* no filtering: forward the block to the plain pixel routine */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    /* horizontal filter written straight to dst */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* same as mc10 but paired with the source shifted one sample right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
\
    copy_block ## SIZE(rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    /* vertical filter written straight to dst */\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    /* same as mc01 but paired with the copied rows one line further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE(rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* vertical part taken one column to the right */\
    copy_block ## SIZE(rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    /* horizontal part taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE(rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    /* horizontal part one row down, vertical part one column right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE(rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid[SIZE*(SIZE+5)];\
\
    /* combined h+v filter written straight to dst; mid holds the first pass */\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    /* horizontal part taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + 2*SIZE;\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    /* vertical part taken one column to the right */\
    copy_block ## SIZE(rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}
2465
2466 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2467 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2468 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2469 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2470 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2471
2472 H264_LOWPASS(put_       , op_put, op2_put)
2473 H264_LOWPASS(avg_       , op_avg, op2_avg)
2474 H264_MC(put_, 2)
2475 H264_MC(put_, 4)
2476 H264_MC(put_, 8)
2477 H264_MC(put_, 16)
2478 H264_MC(avg_, 4)
2479 H264_MC(avg_, 8)
2480 H264_MC(avg_, 16)
2481
2482 #undef op_avg
2483 #undef op_put
2484 #undef op2_avg
2485 #undef op2_put
2486 #endif
2487
/*
 * Per-sample weighting operators used by the functions H264_WEIGHT generates.
 *   op_scale1: block[x] <- clip_uint8((block[x]*weight + offset) >> log2_denom)
 *   op_scale2: dst[x]   <- clip_uint8((src[x]*weights + dst[x]*weightd + offset)
 *                                     >> (log2_denom+1))
 * "offset" is the pre-scaled value computed at the top of each function.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generate weight_h264_pixels<W>x<H>_c and biweight_h264_pixels<W>x<H>_c.
 *
 * weight_*:   rewrites a W x H block in place. The offset is scaled by
 *             2^log2_denom and, when log2_denom is non-zero, half of the
 *             divisor is added so the final shift rounds.
 * biweight_*: blends src into dst. The offset becomes
 *             ((offset + 1) | 1) * 2^log2_denom before the per-sample
 *             shift by log2_denom+1.
 *
 * The offset is scaled through an unsigned shift: the offset may be negative
 * and left-shifting a negative int is undefined behaviour in C. On two's
 * complement targets the result is unchanged.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset = (int)((unsigned)offset << log2_denom); \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(x=0; x<W; x++) \
            op_scale1(x); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int x, y; \
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++) \
            op_scale2(x); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2557
2558 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2559     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2560     int i;
2561
2562     for(i=0; i<h; i++){
2563         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2564         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2565         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2566         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2567         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2568         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2569         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2570         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2571         dst+=dstStride;
2572         src+=srcStride;
2573     }
2574 }
2575
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points for CAVS: each one forwards to the generic put/avg
 * pixel routine of matching width, with the row count equal to the width.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2593
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/**
 * mc00 entry point for VC-1: forwards to put_pixels8_c for 8 rows.
 * The rnd argument is accepted but not used by this variant.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2602
2603 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2604     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2605     int i;
2606
2607     for(i=0; i<w; i++){
2608         const int src_1= src[ -srcStride];
2609         const int src0 = src[0          ];
2610         const int src1 = src[  srcStride];
2611         const int src2 = src[2*srcStride];
2612         const int src3 = src[3*srcStride];
2613         const int src4 = src[4*srcStride];
2614         const int src5 = src[5*srcStride];
2615         const int src6 = src[6*srcStride];
2616         const int src7 = src[7*srcStride];
2617         const int src8 = src[8*srcStride];
2618         const int src9 = src[9*srcStride];
2619         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2620         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2621         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2622         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2623         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2624         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2625         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2626         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2627         src++;
2628         dst++;
2629     }
2630 }
2631
/** mspel mc00: no filtering, forwards the 8-row block to put_pixels8_c. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2635
/**
 * mspel mc10: horizontally filter the 8x8 block into a scratch plane, then
 * pass the source and the filtered plane to put_pixels8_l2
 * (presumably an average of the two -- confirm at its definition).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2641
/** mspel mc20: horizontal lowpass of 8 rows written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2645
/**
 * mspel mc30: like mc10, but the filtered plane is paired with the source
 * shifted one sample to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2651
/** mspel mc02: vertical lowpass of 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2655
/**
 * mspel mc12: pair the vertically filtered source with the h+v filtered
 * source and hand both to put_pixels8_l2.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical pass over it (started at its second row) reads rows -1..9.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * mspel mc32: like mc12, but the purely vertical pass reads the source one
 * column to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * mspel mc22: horizontal pass over 11 rows (starting one row above src)
 * into a scratch plane, then vertical pass over that plane into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2679
2680 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2681     int x;
2682     const int strength= ff_h263_loop_filter_strength[qscale];
2683
2684     for(x=0; x<8; x++){
2685         int d1, d2, ad1;
2686         int p0= src[x-2*stride];
2687         int p1= src[x-1*stride];
2688         int p2= src[x+0*stride];
2689         int p3= src[x+1*stride];
2690         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2691
2692         if     (d<-2*strength) d1= 0;
2693         else if(d<-  strength) d1=-2*strength - d;
2694         else if(d<   strength) d1= d;
2695         else if(d< 2*strength) d1= 2*strength - d;
2696         else                   d1= 0;
2697
2698         p1 += d1;
2699         p2 -= d1;
2700         if(p1&256) p1= ~(p1>>31);
2701         if(p2&256) p2= ~(p2>>31);
2702
2703         src[x-1*stride] = p1;
2704         src[x+0*stride] = p2;
2705
2706         ad1= ABS(d1)>>1;
2707
2708         d2= clip((p0-p3)/4, -ad1, ad1);
2709
2710         src[x-2*stride] = p0 - d2;
2711         src[x+  stride] = p3 + d2;
2712     }
2713 }
2714
2715 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2716     int y;
2717     const int strength= ff_h263_loop_filter_strength[qscale];
2718
2719     for(y=0; y<8; y++){
2720         int d1, d2, ad1;
2721         int p0= src[y*stride-2];
2722         int p1= src[y*stride-1];
2723         int p2= src[y*stride+0];
2724         int p3= src[y*stride+1];
2725         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2726
2727         if     (d<-2*strength) d1= 0;
2728         else if(d<-  strength) d1=-2*strength - d;
2729         else if(d<   strength) d1= d;
2730         else if(d< 2*strength) d1= 2*strength - d;
2731         else                   d1= 0;
2732
2733         p1 += d1;
2734         p2 -= d1;
2735         if(p1&256) p1= ~(p1>>31);
2736         if(p2&256) p2= ~(p2>>31);
2737
2738         src[y*stride-1] = p1;
2739         src[y*stride+0] = p2;
2740
2741         ad1= ABS(d1)>>1;
2742
2743         d2= clip((p0-p3)/4, -ad1, ad1);
2744
2745         src[y*stride-2] = p0 - d2;
2746         src[y*stride+1] = p3 + d2;
2747     }
2748 }
2749
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch array; rows 0 and 7 are only
 * scaled by 4 so they carry the same gain as the filtered rows.
 * Pass 2 filters horizontally and writes back; columns 0 and 7 skip the
 * horizontal filter and only undo the vertical gain (rounded >> 2), the
 * interior undoes both gains (rounded >> 4).
 *
 * @param src    top-left sample of the block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[8][8];
    int row, col;

    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                vert[row][col]= 4*line[col];
            else
                vert[row][col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;

        line[0]= (vert[row][0] + 2)>>2;
        line[7]= (vert[row][7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (vert[row][col-1] + 2*vert[row][col] + vert[row][col+1] + 8)>>4;
    }
}
2776
/**
 * Filter one 16-position luma edge with per-group clipping limits.
 *
 * The edge is walked in four groups of four positions; group i uses tc0[i].
 * A position is filtered only when |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. p1 (resp. q1) is additionally adjusted when |p2-p0|
 * (resp. |q2-q0|) is below beta, and each such adjustment widens the
 * clipping range used for p0/q0 by one.
 *
 * @param pix     the q0 sample of the first position; p samples lie at
 *                negative multiples of xstride
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   exclusive upper bound on |p0 - q0|
 * @param beta    exclusive upper bound on the p1/p0, q1/q0, p2/p0, q2/q0 differences
 * @param tc0     four clipping limits; a negative entry skips its whole group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: step over its four positions */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* multiply rather than << 2: q0 - p0 can be negative and
                 * left-shifting a negative int is undefined behaviour */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Luma edge filter with xstride = stride and ystride = 1: the p/q taps sit
 * one line apart and the filter advances one sample at a time along a line.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Luma edge filter with xstride = 1 and ystride = stride: the p/q taps are
 * adjacent samples of a line and the filter advances one line at a time.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2825
/**
 * Filter one 8-position chroma edge with per-group clipping limits.
 *
 * The edge is walked in four groups of two positions; group i uses tc0[i].
 * A position is filtered only when |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta; only p0 and q0 are modified.
 *
 * @param pix     the q0 sample of the first position; p samples lie at
 *                negative multiples of xstride
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   exclusive upper bound on |p0 - q0|
 * @param beta    exclusive upper bound on |p1 - p0| and |q1 - q0|
 * @param tc0     four clipping limits; an entry <= 0 skips its whole group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* group disabled: step over its two positions */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* multiply rather than << 2: q0 - p0 can be negative and
                 * left-shifting a negative int is undefined behaviour */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Chroma edge filter with xstride = stride and ystride = 1: the p/q taps
 * sit one line apart and the filter advances one sample at a time.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Chroma edge filter with xstride = 1 and ystride = stride: the p/q taps
 * are adjacent samples of a line and the filter advances one line at a time.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2862
/**
 * Filter one 8-position chroma edge without clipping limits.
 *
 * A position is filtered only when |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. p0 becomes (2*p1 + p0 + q1 + 2) >> 2 and q0 becomes
 * (2*q1 + q0 + p1 + 2) >> 2, both computed from the unmodified samples.
 *
 * @param pix     the q0 sample of the first position
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   exclusive upper bound on |p0 - q0|
 * @param beta    exclusive upper bound on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int left;

    for( left = 8; left > 0; left--, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-xstride];
        const int q0 = pix[0];
        const int q1 = pix[xstride];

        /* leave the position untouched unless all three tests pass */
        if( ABS( p0 - q0 ) >= alpha ||
            ABS( p1 - p0 ) >= beta ||
            ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma edge filter with xstride = stride and ystride = 1: the p/q
 * taps sit one line apart and the filter advances one sample at a time.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/**
 * Intra chroma edge filter with xstride = 1 and ystride = stride: the p/q
 * taps are adjacent samples of a line and the filter advances one line at a time.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
2890
/**
 * Sum of absolute differences between two 16-sample-wide blocks.
 *
 * @param v         not used
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2918
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which every sample is replaced by avg2() of itself and its right
 * neighbour. 17 reference samples are read per row.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2946
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which every sample is replaced by avg2() of itself and the sample one
 * row below. h+1 reference rows are read.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2976
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which every sample is replaced by avg4() of the 2x2 neighbourhood whose
 * top-left corner it is. 17 samples of h+1 reference rows are read.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3006
/**
 * Plain sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two lines of either block
 * @param h         number of lines to compare
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3026
/**
 * Sum of absolute differences over an 8-pixel-wide block, where every
 * reference sample is avg2() of a pix2 sample and its right neighbour.
 * Reads 9 columns of pix2 per line.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between two lines of either block
 * @param h         number of lines to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3046
/**
 * Sum of absolute differences over an 8-pixel-wide block, where every
 * reference sample is avg2() of a pix2 sample and the sample one line
 * (line_size bytes) below it. h+1 lines of pix2 are therefore read.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between two lines of either block
 * @param h         number of lines to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3068
/**
 * Sum of absolute differences over an 8-pixel-wide block, where every
 * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the sample,
 * its right neighbour, and the two samples one line below them).
 * Reads 9 columns and h+1 lines of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between two lines of either block
 * @param h         number of lines to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3090
3091 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092     MpegEncContext *c = v;
3093     int score1=0;
3094     int score2=0;
3095     int x,y;
3096
3097     for(y=0; y<h; y++){
3098         for(x=0; x<16; x++){
3099             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3100         }
3101         if(y+1<h){
3102             for(x=0; x<15; x++){
3103                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3104                              - s1[x+1] + s1[x+1+stride])
3105                         -ABS(  s2[x  ] - s2[x  +stride]
3106                              - s2[x+1] + s2[x+1+stride]);
3107             }
3108         }
3109         s1+= stride;
3110         s2+= stride;
3111     }
3112
3113     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114     else  return score1 + ABS(score2)*8;
3115 }
3116
3117 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3118     MpegEncContext *c = v;
3119     int score1=0;
3120     int score2=0;
3121     int x,y;
3122
3123     for(y=0; y<h; y++){
3124         for(x=0; x<8; x++){
3125             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3126         }
3127         if(y+1<h){
3128             for(x=0; x<7; x++){
3129                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3130                              - s1[x+1] + s1[x+1+stride])
3131                         -ABS(  s2[x  ] - s2[x  +stride]
3132                              - s2[x+1] + s2[x+1+stride]);
3133             }
3134         }
3135         s1+= stride;
3136         s2+= stride;
3137     }
3138
3139     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3140     else  return score1 + ABS(score2)*8;
3141 }
3142
3143 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3144     int i;
3145     unsigned int sum=0;
3146
3147     for(i=0; i<8*8; i++){
3148         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3149         int w= weight[i];
3150         b>>= RECON_SHIFT;
3151         assert(-512<b && b<512);
3152
3153         sum += (w*b)*(w*b)>>4;
3154     }
3155     return sum>>2;
3156 }
3157
3158 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3159     int i;
3160
3161     for(i=0; i<8*8; i++){
3162         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3163     }
3164 }
3165
3166 /**
3167  * permutes an 8x8 block.
3168  * @param block the block which will be permuted according to the given permutation vector
3169  * @param permutation the permutation vector
3170  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3171  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3172  *                  (inverse) permutated to scantable order!
3173  */
3174 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3175 {
3176     int i;
3177     DCTELEM temp[64];
3178
3179     if(last<=0) return;
3180     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3181
3182     for(i=0; i<=last; i++){
3183         const int j= scantable[i];
3184         temp[j]= block[j];
3185         block[j]=0;
3186     }
3187
3188     for(i=0; i<=last; i++){
3189         const int j= scantable[i];
3190         const int perm_j= permutation[j];
3191         block[perm_j]= temp[j];
3192     }
3193 }
3194
/* Comparison function that ignores all of its arguments and always scores 0;
 * ff_set_cmp() installs it for FF_CMP_ZERO. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3198
3199 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3200     int i;
3201
3202     memset(cmp, 0, sizeof(void*)*5);
3203
3204     for(i=0; i<5; i++){
3205         switch(type&0xFF){
3206         case FF_CMP_SAD:
3207             cmp[i]= c->sad[i];
3208             break;
3209         case FF_CMP_SATD:
3210             cmp[i]= c->hadamard8_diff[i];
3211             break;
3212         case FF_CMP_SSE:
3213             cmp[i]= c->sse[i];
3214             break;
3215         case FF_CMP_DCT:
3216             cmp[i]= c->dct_sad[i];
3217             break;
3218         case FF_CMP_DCT264:
3219             cmp[i]= c->dct264_sad[i];
3220             break;
3221         case FF_CMP_DCTMAX:
3222             cmp[i]= c->dct_max[i];
3223             break;
3224         case FF_CMP_PSNR:
3225             cmp[i]= c->quant_psnr[i];
3226             break;
3227         case FF_CMP_BIT:
3228             cmp[i]= c->bit[i];
3229             break;
3230         case FF_CMP_RD:
3231             cmp[i]= c->rd[i];
3232             break;
3233         case FF_CMP_VSAD:
3234             cmp[i]= c->vsad[i];
3235             break;
3236         case FF_CMP_VSSE:
3237             cmp[i]= c->vsse[i];
3238             break;
3239         case FF_CMP_ZERO:
3240             cmp[i]= zero_cmp;
3241             break;
3242         case FF_CMP_NSSE:
3243             cmp[i]= c->nsse[i];
3244             break;
3245 #ifdef CONFIG_SNOW_ENCODER
3246         case FF_CMP_W53:
3247             cmp[i]= c->w53[i];
3248             break;
3249         case FF_CMP_W97:
3250             cmp[i]= c->w97[i];
3251             break;
3252 #endif
3253         default:
3254             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3255         }
3256     }
3257 }
3258
3259 /**
3260  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3261  */
3262 static void clear_blocks_c(DCTELEM *blocks)
3263 {
3264     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3265 }
3266
/**
 * Adds src to dst byte by byte; each sum wraps modulo 256.
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] += src[pos];
        pos++;
    }
}
3282
/**
 * Stores src1 - src2 into dst byte by byte; each difference wraps modulo 256.
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] = src1[pos] - src2[pos];
        pos++;
    }
}
3298
/**
 * Writes to dst the difference between each src2 byte and a prediction
 * obtained with mid_pred() from the left value, the src1 byte at the same
 * index, and (left + src1 - left_top) & 0xFF.
 * @param dst      output residual bytes
 * @param src1     bytes supplying the second predictor input; each also
 *                 becomes the next left_top value
 * @param src2     bytes being predicted; each also becomes the next left value
 * @param w        number of bytes to process
 * @param left     in: initial left value, out: last src2 byte processed
 * @param left_top in: initial left_top value, out: last src1 byte processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int pred = mid_pred(cur_left, src1[x], (cur_left + src1[x] - cur_left_top)&0xFF);
        cur_left_top = src1[x];
        cur_left     = src2[x];
        dst[x]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
3316
/* Two-input butterfly: o1 receives i1+i2 and o2 receives i1-i2.
 * Expands to two separate statements, so it must not be used as the
 * unbraced body of an if/for/while. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y, both computed from
 * the values x and y held on entry. x and y must be lvalues. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Butterfly folded into a magnitude: |x+y| + |x-y|, nothing is stored. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3331
/**
 * Sum of absolute values of an 8x8 butterfly transform (a Hadamard
 * transform, going by the function name) of the difference src - dst.
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance in bytes between two lines of either block
 * @param h      must be 8 (asserted)
 * @return the summed coefficient magnitudes
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages over each row of src-dst */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: two stored stages per column, the third stage is
     * evaluated by BUTTERFLYA directly as magnitudes and accumulated */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3383
/**
 * Sum of absolute values of an 8x8 butterfly transform (a Hadamard
 * transform, going by the function name) of the block src itself, with the
 * magnitude of the coefficient that sums all 64 samples removed.
 * @param s      unused
 * @param src    8x8 block to transform
 * @param dummy  unused
 * @param stride distance in bytes between two lines of src
 * @param h      must be 8 (asserted)
 * @return the summed coefficient magnitudes, excluding the all-samples sum
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages over each row of src */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: two stored stages per column, the third stage is
     * evaluated by BUTTERFLYA directly as magnitudes and accumulated */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] is the sum of all 64 input samples; its magnitude was
     * added by the first BUTTERFLYA of column 0 and is taken out again here */
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3431
3432 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3433     MpegEncContext * const s= (MpegEncContext *)c;
3434     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3435     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3436     int sum=0, i;
3437
3438     assert(h==8);
3439
3440     s->dsp.diff_pixels(temp, src1, src2, stride);
3441     s->dsp.fdct(temp);
3442
3443     for(i=0; i<64; i++)
3444         sum+= ABS(temp[i]);
3445
3446     return sum;
3447 }
3448
#ifdef CONFIG_GPL
/* One 8-point integer transform pass built only from adds, subtracts and
 * shifts. Inputs are read through SRC(n) and each output is emitted through
 * DST(n, value); the user defines both macros before expanding DCT8_1D. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute values of the DCT8_1D-transformed 8x8 difference produced
 * by diff_pixels(src1, src2). The transform is applied along the second
 * array index first (results stored back), then along the first index with
 * the outputs accumulated as magnitudes instead of being stored.
 * @param c      MpegEncContext* whose dsp.diff_pixels is used
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between two lines of either block
 * @param h      unused
 * @return the summed coefficient magnitudes
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* pass the first row so the argument is a pointer to the 16-bit
     * elements rather than a pointer to an array of eight of them */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3501
3502 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3503     MpegEncContext * const s= (MpegEncContext *)c;
3504     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3505     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3506     int sum=0, i;
3507
3508     assert(h==8);
3509
3510     s->dsp.diff_pixels(temp, src1, src2, stride);
3511     s->dsp.fdct(temp);
3512
3513     for(i=0; i<64; i++)
3514         sum= FFMAX(sum, ABS(temp[i]));
3515
3516     return sum;
3517 }
3518
/* local prototype for the inverse transform called by quant_psnr8x8_c() */
void simple_idct(DCTELEM *block); //FIXME

/**
 * Squared error introduced by quantizing the 8x8 difference block:
 * the difference from diff_pixels(src1, src2) is quantized, dequantized with
 * the inter dequantizer and inverse transformed, then compared against a
 * copy of the unquantized difference.
 * Side effects on the context: sets s->mb_intra to 0 and overwrites
 * s->block_last_index[0].
 * @param c      MpegEncContext* providing the dsp functions, quantizer and qscale
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between two lines of either block
 * @param h      must be 8 (asserted)
 * @return sum of squared differences between the round-tripped and the
 *         original difference block
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one buffer holding two 64-coefficient blocks: temp (worked on) and bak (reference copy) */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    /* i only receives the quantizer's last output argument here; its value is not used */
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3544
3545 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3546     MpegEncContext * const s= (MpegEncContext *)c;
3547     const uint8_t *scantable= s->intra_scantable.permutated;
3548     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3549     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3550     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3551     uint8_t * const bak= (uint8_t*)aligned_bak;
3552     int i, last, run, bits, level, distoration, start_i;
3553     const int esc_length= s->ac_esc_length;
3554     uint8_t * length;
3555     uint8_t * last_length;
3556
3557     assert(h==8);
3558
3559     for(i=0; i<8; i++){
3560         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3561         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3562     }
3563
3564     s->dsp.diff_pixels(temp, src1, src2, stride);
3565
3566     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3567
3568     bits=0;
3569
3570     if (s->mb_intra) {
3571         start_i = 1;
3572         length     = s->intra_ac_vlc_length;
3573         last_length= s->intra_ac_vlc_last_length;
3574         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3575     } else {
3576         start_i = 0;
3577         length     = s->inter_ac_vlc_length;
3578         last_length= s->inter_ac_vlc_last_length;
3579     }
3580
3581     if(last>=start_i){
3582         run=0;
3583         for(i=start_i; i<last; i++){
3584             int j= scantable[i];
3585             level= temp[j];
3586
3587             if(level){
3588                 level+=64;
3589                 if((level&(~127)) == 0){
3590                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3591                 }else
3592                     bits+= esc_length;
3593                 run=0;
3594             }else
3595                 run++;
3596         }
3597         i= scantable[last];
3598
3599         level= temp[i] + 64;
3600
3601         assert(level - 64);
3602
3603         if((level&(~127)) == 0){
3604             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3605         }else
3606             bits+= esc_length;
3607
3608     }
3609
3610     if(last>=0){
3611         if(s->mb_intra)
3612             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3613         else
3614             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3615     }
3616
3617     s->dsp.idct_add(bak, stride, temp);
3618
3619     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3620
3621     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3622 }
3623
/**
 * Number of bits, according to the VLC length tables of the context, needed
 * to code the quantized 8x8 difference between src1 and src2.
 * Side effect on the context: overwrites s->block_last_index[0].
 * @param c      MpegEncContext* providing dsp functions, quantizer, qscale,
 *               mb_intra and the VLC length tables
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between two lines of either block
 * @param h      must be 8 (asserted)
 * @return the accumulated bit count
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i only receives the quantizer's last output argument here; its value is not used */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        /* coefficient 0 is costed separately from the DC length table */
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                /* levels outside -64..63 are charged the escape length */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3683
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block (h-1 line pairs).
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride distance in bytes between two lines of s
 * @param h      number of lines in the block
 * @return the accumulated absolute vertical difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int col, row;

    for(row=1; row<h; row++){
        uint8_t *next= s + stride;
        for(col=0; col<16; col++){
            total+= ABS(s[col] - next[col]);
        }
        s= next;
    }

    return total;
}
3698
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block: for each pair of adjacent lines, the residual of the
 * upper line minus the residual of the lower line.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two lines of either block
 * @param h      number of lines in the block
 * @return the accumulated absolute difference
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int col, row;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            total+= ABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3713
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block (h-1 line pairs).
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride distance in bytes between two lines of s
 * @param h      number of lines in the block
 * @return the accumulated squared vertical difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int col, row;

    for(row=1; row<h; row++){
        uint8_t *next= s + stride;
        for(col=0; col<16; col++){
            total+= SQ(s[col] - next[col]);
        }
        s= next;
    }

    return total;
}
3729
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block: for each pair of adjacent lines, the residual of the
 * upper line minus the residual of the lower line, squared.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two lines of either block
 * @param h      number of lines in the block
 * @return the accumulated squared difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int col, row;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            total+= SQ(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3744
/* Generate the *16_c variants (second argument) from the 8x8 comparison
 * functions (first argument). WARPER8_16_SQ is defined elsewhere in this
 * file; presumably it applies the 8x8 function to each 8x8 quadrant of a
 * 16-wide block and adds the results -- confirm at the macro definition. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3755
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* Runs j_rev_dct() on block in place, then stores the result to dest through
 * put_pixels_clamped_c(). line_size is the byte distance between dest lines. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Runs j_rev_dct() on block in place, then adds the result onto dest through
 * add_pixels_clamped_c(). line_size is the byte distance between dest lines. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3768
/* Runs j_rev_dct4() on block in place, then stores the result to dest through
 * put_pixels_clamped4_c(). dsputil_init() selects this for lowres==1. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* Runs j_rev_dct4() on block in place, then adds the result onto dest through
 * add_pixels_clamped4_c(). dsputil_init() selects this for lowres==1. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3779
/* Runs j_rev_dct2() on block in place, then stores the result to dest through
 * put_pixels_clamped2_c(). dsputil_init() selects this for lowres==2. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* Runs j_rev_dct2() on block in place, then adds the result onto dest through
 * add_pixels_clamped2_c(). dsputil_init() selects this for lowres==2. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3790
3791 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3792 {
3793     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3794
3795     dest[0] = cm[(block[0] + 4)>>3];
3796 }
3797 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3798 {
3799     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3800
3801     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3802 }
3803
/* Does nothing. The parameter list is intentionally left unspecified. */
static void just_return() { }
3805
3806 /* init static data */
3807 void dsputil_static_init(void)
3808 {
3809     int i;
3810
3811     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3812     for(i=0;i<MAX_NEG_CROP;i++) {
3813         cropTbl[i] = 0;
3814         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3815     }
3816
3817     for(i=0;i<512;i++) {
3818         squareTbl[i] = (i - 256) * (i - 256);
3819     }
3820
3821     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3822 }
3823
3824
3825 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3826 {
3827     int i;
3828
3829 #ifdef CONFIG_ENCODERS
3830     if(avctx->dct_algo==FF_DCT_FASTINT) {
3831         c->fdct = fdct_ifast;
3832         c->fdct248 = fdct_ifast248;
3833     }
3834     else if(avctx->dct_algo==FF_DCT_FAAN) {
3835         c->fdct = ff_faandct;
3836         c->fdct248 = ff_faandct248;
3837     }
3838     else {
3839         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3840         c->fdct248 = ff_fdct248_islow;
3841     }
3842 #endif //CONFIG_ENCODERS
3843
3844     if(avctx->lowres==1){
3845         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3846             c->idct_put= ff_jref_idct4_put;
3847             c->idct_add= ff_jref_idct4_add;
3848         }else{
3849             c->idct_put= ff_h264_lowres_idct_put_c;
3850             c->idct_add= ff_h264_lowres_idct_add_c;
3851         }
3852         c->idct    = j_rev_dct4;
3853         c->idct_permutation_type= FF_NO_IDCT_PERM;
3854     }else if(avctx->lowres==2){
3855         c->idct_put= ff_jref_idct2_put;
3856         c->idct_add= ff_jref_idct2_add;
3857         c->idct    = j_rev_dct2;
3858         c->idct_permutation_type= FF_NO_IDCT_PERM;
3859     }else if(avctx->lowres==3){
3860         c->idct_put= ff_jref_idct1_put;
3861         c->idct_add= ff_jref_idct1_add;
3862         c->idct    = j_rev_dct1;
3863         c->idct_permutation_type= FF_NO_IDCT_PERM;
3864     }else{
3865         if(avctx->idct_algo==FF_IDCT_INT){
3866             c->idct_put= ff_jref_idct_put;
3867             c->idct_add= ff_jref_idct_add;
3868             c->idct    = j_rev_dct;
3869             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3870         }else if(avctx->idct_algo==FF_IDCT_VP3){
3871             c->idct_put= ff_vp3_idct_put_c;
3872             c->idct_add= ff_vp3_idct_add_c;
3873             c->idct    = ff_vp3_idct_c;
3874             c->idct_permutation_type= FF_NO_IDCT_PERM;
3875         }else{ //accurate/default
3876             c->idct_put= simple_idct_put;
3877             c->idct_add= simple_idct_add;
3878             c->idct    = simple_idct;
3879             c->idct_permutation_type= FF_NO_IDCT_PERM;
3880         }
3881     }
3882
3883     c->h264_idct_add= ff_h264_idct_add_c;
3884     c->h264_idct8_add= ff_h264_idct8_add_c;
3885     c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3886     c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3887
3888     c->get_pixels = get_pixels_c;
3889     c->diff_pixels = diff_pixels_c;
3890     c->put_pixels_clamped = put_pixels_clamped_c;
3891     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3892     c->add_pixels_clamped = add_pixels_clamped_c;
3893     c->add_pixels8 = add_pixels8_c;
3894     c->add_pixels4 = add_pixels4_c;
3895     c->gmc1 = gmc1_c;
3896     c->gmc = ff_gmc_c;
3897     c->clear_blocks = clear_blocks_c;
3898     c->pix_sum = pix_sum_c;
3899     c->pix_norm1 = pix_norm1_c;
3900
3901     /* TODO [0] 16  [1] 8 */
3902     c->pix_abs[0][0] = pix_abs16_c;
3903     c->pix_abs[0][1] = pix_abs16_x2_c;
3904     c->pix_abs[0][2] = pix_abs16_y2_c;
3905     c->pix_abs[0][3] = pix_abs16_xy2_c;
3906     c->pix_abs[1][0] = pix_abs8_c;
3907     c->pix_abs[1][1] = pix_abs8_x2_c;
3908     c->pix_abs[1][2] = pix_abs8_y2_c;
3909     c->pix_abs[1][3] = pix_abs8_xy2_c;
3910
3911 #define dspfunc(PFX, IDX, NUM) \
3912     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3913     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3914     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3915     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3916
3917     dspfunc(put, 0, 16);
3918     dspfunc(put_no_rnd, 0, 16);
3919     dspfunc(put, 1, 8);
3920     dspfunc(put_no_rnd, 1, 8);
3921     dspfunc(put, 2, 4);
3922     dspfunc(put, 3, 2);
3923
3924     dspfunc(avg, 0, 16);
3925     dspfunc(avg_no_rnd, 0, 16);
3926     dspfunc(avg, 1, 8);
3927     dspfunc(avg_no_rnd, 1, 8);
3928     dspfunc(avg, 2, 4);
3929     dspfunc(avg, 3, 2);
3930 #undef dspfunc
3931
3932     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3933     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3934
3935     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3936     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3937     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3938     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3939     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3940     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3941     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3942     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3943     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3944
3945     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3946     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3947     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3948     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3949     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3950     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3951     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3952     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3953     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3954
3955 #define dspfunc(PFX, IDX, NUM) \
3956     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3957     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3958     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3959     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3960     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3961     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3962     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3963     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3964     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3965     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3966     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3967     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3968     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3969     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3970     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3971     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3972
3973     dspfunc(put_qpel, 0, 16);
3974     dspfunc(put_no_rnd_qpel, 0, 16);
3975
3976     dspfunc(avg_qpel, 0, 16);
3977     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3978
3979     dspfunc(put_qpel, 1, 8);
3980     dspfunc(put_no_rnd_qpel, 1, 8);
3981
3982     dspfunc(avg_qpel, 1, 8);
3983     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3984
3985     dspfunc(put_h264_qpel, 0, 16);
3986     dspfunc(put_h264_qpel, 1, 8);
3987     dspfunc(put_h264_qpel, 2, 4);
3988     dspfunc(put_h264_qpel, 3, 2);
3989     dspfunc(avg_h264_qpel, 0, 16);
3990     dspfunc(avg_h264_qpel, 1, 8);
3991     dspfunc(avg_h264_qpel, 2, 4);
3992
3993 #undef dspfunc
3994     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3995     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3996     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3997     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3998     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3999     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4000
4001     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4002     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4003     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4004     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4005     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4006     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4007     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4008     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4009     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4010     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4011     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4012     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4013     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4014     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4015     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4016     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4017     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4018     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4019     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4020     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4021
4022 #ifdef CONFIG_CAVS_DECODER
4023     ff_cavsdsp_init(c,avctx);
4024 #endif
4025 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4026     ff_vc1dsp_init(c,avctx);
4027 #endif
4028
4029     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4030     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4031     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4032     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4033     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4034     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4035     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4036     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4037
4038 #define SET_CMP_FUNC(name) \
4039     c->name[0]= name ## 16_c;\
4040     c->name[1]= name ## 8x8_c;
4041
4042     SET_CMP_FUNC(hadamard8_diff)
4043     c->hadamard8_diff[4]= hadamard8_intra16_c;
4044     SET_CMP_FUNC(dct_sad)
4045     SET_CMP_FUNC(dct_max)
4046 #ifdef CONFIG_GPL
4047     SET_CMP_FUNC(dct264_sad)
4048 #endif
4049     c->sad[0]= pix_abs16_c;
4050     c->sad[1]= pix_abs8_c;
4051     c->sse[0]= sse16_c;
4052     c->sse[1]= sse8_c;
4053     c->sse[2]= sse4_c;
4054     SET_CMP_FUNC(quant_psnr)
4055     SET_CMP_FUNC(rd)
4056     SET_CMP_FUNC(bit)
4057     c->vsad[0]= vsad16_c;
4058     c->vsad[4]= vsad_intra16_c;
4059     c->vsse[0]= vsse16_c;
4060     c->vsse[4]= vsse_intra16_c;
4061     c->nsse[0]= nsse16_c;
4062     c->nsse[1]= nsse8_c;
4063 #ifdef CONFIG_SNOW_ENCODER
4064     c->w53[0]= w53_16_c;
4065     c->w53[1]= w53_8_c;
4066     c->w97[0]= w97_16_c;
4067     c->w97[1]= w97_8_c;
4068 #endif
4069
4070     c->add_bytes= add_bytes_c;
4071     c->diff_bytes= diff_bytes_c;
4072     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4073     c->bswap_buf= bswap_buf;
4074
4075     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4076     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4077     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4078     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4079     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4080     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4081
4082     c->h263_h_loop_filter= h263_h_loop_filter_c;
4083     c->h263_v_loop_filter= h263_v_loop_filter_c;
4084
4085     c->h261_loop_filter= h261_loop_filter_c;
4086
4087     c->try_8x8basis= try_8x8basis_c;
4088     c->add_8x8basis= add_8x8basis_c;
4089
4090 #ifdef CONFIG_SNOW_ENCODER
4091     c->vertical_compose97i = ff_snow_vertical_compose97i;
4092     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4093     c->inner_add_yblock = ff_snow_inner_add_yblock;
4094 #endif
4095
4096 #ifdef CONFIG_VORBIS_DECODER
4097     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4098 #endif
4099
4100     c->shrink[0]= ff_img_copy_plane;
4101     c->shrink[1]= ff_shrink22;
4102     c->shrink[2]= ff_shrink44;
4103     c->shrink[3]= ff_shrink88;
4104
4105     c->prefetch= just_return;
4106
4107 #ifdef HAVE_MMX
4108     dsputil_init_mmx(c, avctx);
4109 #endif
4110 #ifdef ARCH_ARMV4L
4111     dsputil_init_armv4l(c, avctx);
4112 #endif
4113 #ifdef HAVE_MLIB
4114     dsputil_init_mlib(c, avctx);
4115 #endif
4116 #ifdef ARCH_SPARC
4117    dsputil_init_vis(c,avctx);
4118 #endif
4119 #ifdef ARCH_ALPHA
4120     dsputil_init_alpha(c, avctx);
4121 #endif
4122 #ifdef ARCH_POWERPC
4123     dsputil_init_ppc(c, avctx);
4124 #endif
4125 #ifdef HAVE_MMI
4126     dsputil_init_mmi(c, avctx);
4127 #endif
4128 #ifdef ARCH_SH4
4129     dsputil_init_sh4(c,avctx);
4130 #endif
4131
4132     switch(c->idct_permutation_type){
4133     case FF_NO_IDCT_PERM:
4134         for(i=0; i<64; i++)
4135             c->idct_permutation[i]= i;
4136         break;
4137     case FF_LIBMPEG2_IDCT_PERM:
4138         for(i=0; i<64; i++)
4139             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4140         break;
4141     case FF_SIMPLE_IDCT_PERM:
4142         for(i=0; i<64; i++)
4143             c->idct_permutation[i]= simple_mmx_permutation[i];
4144         break;
4145     case FF_TRANSPOSE_IDCT_PERM:
4146         for(i=0; i<64; i++)
4147             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4148         break;
4149     case FF_PARTTRANS_IDCT_PERM:
4150         for(i=0; i<64; i++)
4151             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4152         break;
4153     default:
4154         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4155     }
4156 }
4157