git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "diracdsp.h"
  42
/* Shared lookup tables. Both are zero-initialised here; the code that fills
 * them is not visible in this part of the file.
 *
 * ff_cropTbl:   accessed through a pointer offset by MAX_NEG_CROP, leaving
 *               MAX_NEG_CROP entries of headroom on either side of the
 *               256 central entries.
 * ff_squareTbl: accessed through a pointer offset by 256, so indices in
 *               -256..255 are valid (presumably holds squares -- confirm
 *               against the initialisation code). */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 9
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 10
  51 #include "dsputil_template.c"
  52 #undef BIT_DEPTH
  53
  54 #define BIT_DEPTH 8
  55 #include "dsputil_template.c"
  56
  57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  58 #define pb_7f (~0UL/255 * 0x7f)
  59 #define pb_80 (~0UL/255 * 0x80)
  60
  61 const uint8_t ff_zigzag_direct[64] = {
  62     0,   1,  8, 16,  9,  2,  3, 10,
  63     17, 24, 32, 25, 18, 11,  4,  5,
  64     12, 19, 26, 33, 40, 48, 41, 34,
  65     27, 20, 13,  6,  7, 14, 21, 28,
  66     35, 42, 49, 56, 57, 50, 43, 36,
  67     29, 22, 15, 23, 30, 37, 44, 51,
  68     58, 59, 52, 45, 38, 31, 39, 46,
  69     53, 60, 61, 54, 47, 55, 62, 63
  70 };
  71
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Entry i is the block index (0..63) visited at scan position i; the
   table is a permutation of 0..63. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  84
  85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  86 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  87
  88 const uint8_t ff_alternate_horizontal_scan[64] = {
  89     0,  1,   2,  3,  8,  9, 16, 17,
  90     10, 11,  4,  5,  6,  7, 15, 14,
  91     13, 12, 19, 18, 24, 25, 32, 33,
  92     26, 27, 20, 21, 22, 23, 28, 29,
  93     30, 31, 34, 35, 40, 41, 48, 49,
  94     42, 43, 36, 37, 38, 39, 44, 45,
  95     46, 47, 50, 51, 56, 57, 58, 59,
  96     52, 53, 54, 55, 60, 61, 62, 63,
  97 };
  98
  99 const uint8_t ff_alternate_vertical_scan[64] = {
 100     0,  8,  16, 24,  1,  9,  2, 10,
 101     17, 25, 32, 40, 48, 56, 57, 49,
 102     41, 33, 26, 18,  3, 11,  4, 12,
 103     19, 27, 34, 42, 50, 58, 35, 43,
 104     51, 59, 20, 28,  5, 13,  6, 14,
 105     21, 29, 36, 44, 52, 60, 37, 45,
 106     53, 61, 22, 30,  7, 15, 23, 31,
 107     38, 46, 54, 62, 39, 47, 55, 63,
 108 };
 109
 110 /* Input permutation for the simple_idct_mmx */
 111 static const uint8_t simple_mmx_permutation[64]={
 112         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 113         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 114         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 115         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 116         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 117         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 118         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 119         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 120 };
 121
 122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134     }
 135
 136     end=-1;
 137     for(i=0; i<64; i++){
 138         int j;
 139         j = st->permutated[i];
 140         if(j>end) end=j;
 141         st->raster_end[i]= end;
 142     }
 143 }
 144
 145 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 146                                    int idct_permutation_type)
 147 {
 148     int i;
 149
 150     switch(idct_permutation_type){
 151     case FF_NO_IDCT_PERM:
 152         for(i=0; i<64; i++)
 153             idct_permutation[i]= i;
 154         break;
 155     case FF_LIBMPEG2_IDCT_PERM:
 156         for(i=0; i<64; i++)
 157             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 158         break;
 159     case FF_SIMPLE_IDCT_PERM:
 160         for(i=0; i<64; i++)
 161             idct_permutation[i]= simple_mmx_permutation[i];
 162         break;
 163     case FF_TRANSPOSE_IDCT_PERM:
 164         for(i=0; i<64; i++)
 165             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 166         break;
 167     case FF_PARTTRANS_IDCT_PERM:
 168         for(i=0; i<64; i++)
 169             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 170         break;
 171     case FF_SSE2_IDCT_PERM:
 172         for(i=0; i<64; i++)
 173             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 174         break;
 175     default:
 176         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 177     }
 178 }
 179
 180 static int pix_sum_c(uint8_t * pix, int line_size)
 181 {
 182     int s, i, j;
 183
 184     s = 0;
 185     for (i = 0; i < 16; i++) {
 186         for (j = 0; j < 16; j += 8) {
 187             s += pix[0];
 188             s += pix[1];
 189             s += pix[2];
 190             s += pix[3];
 191             s += pix[4];
 192             s += pix[5];
 193             s += pix[6];
 194             s += pix[7];
 195             pix += 8;
 196         }
 197         pix += line_size - 16;
 198     }
 199     return s;
 200 }
 201
 202 static int pix_norm1_c(uint8_t * pix, int line_size)
 203 {
 204     int s, i, j;
 205     uint32_t *sq = ff_squareTbl + 256;
 206
 207     s = 0;
 208     for (i = 0; i < 16; i++) {
 209         for (j = 0; j < 16; j += 8) {
 210 #if 0
 211             s += sq[pix[0]];
 212             s += sq[pix[1]];
 213             s += sq[pix[2]];
 214             s += sq[pix[3]];
 215             s += sq[pix[4]];
 216             s += sq[pix[5]];
 217             s += sq[pix[6]];
 218             s += sq[pix[7]];
 219 #else
 220 #if HAVE_FAST_64BIT
 221             register uint64_t x=*(uint64_t*)pix;
 222             s += sq[x&0xff];
 223             s += sq[(x>>8)&0xff];
 224             s += sq[(x>>16)&0xff];
 225             s += sq[(x>>24)&0xff];
 226             s += sq[(x>>32)&0xff];
 227             s += sq[(x>>40)&0xff];
 228             s += sq[(x>>48)&0xff];
 229             s += sq[(x>>56)&0xff];
 230 #else
 231             register uint32_t x=*(uint32_t*)pix;
 232             s += sq[x&0xff];
 233             s += sq[(x>>8)&0xff];
 234             s += sq[(x>>16)&0xff];
 235             s += sq[(x>>24)&0xff];
 236             x=*(uint32_t*)(pix+4);
 237             s += sq[x&0xff];
 238             s += sq[(x>>8)&0xff];
 239             s += sq[(x>>16)&0xff];
 240             s += sq[(x>>24)&0xff];
 241 #endif
 242 #endif
 243             pix += 8;
 244         }
 245         pix += line_size - 16;
 246     }
 247     return s;
 248 }
 249
 250 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 251     int i;
 252
 253     for(i=0; i+8<=w; i+=8){
 254         dst[i+0]= av_bswap32(src[i+0]);
 255         dst[i+1]= av_bswap32(src[i+1]);
 256         dst[i+2]= av_bswap32(src[i+2]);
 257         dst[i+3]= av_bswap32(src[i+3]);
 258         dst[i+4]= av_bswap32(src[i+4]);
 259         dst[i+5]= av_bswap32(src[i+5]);
 260         dst[i+6]= av_bswap32(src[i+6]);
 261         dst[i+7]= av_bswap32(src[i+7]);
 262     }
 263     for(;i<w; i++){
 264         dst[i+0]= av_bswap32(src[i+0]);
 265     }
 266 }
 267
 268 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 269 {
 270     while (len--)
 271         *dst++ = av_bswap16(*src++);
 272 }
 273
 274 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 275 {
 276     int s, i;
 277     uint32_t *sq = ff_squareTbl + 256;
 278
 279     s = 0;
 280     for (i = 0; i < h; i++) {
 281         s += sq[pix1[0] - pix2[0]];
 282         s += sq[pix1[1] - pix2[1]];
 283         s += sq[pix1[2] - pix2[2]];
 284         s += sq[pix1[3] - pix2[3]];
 285         pix1 += line_size;
 286         pix2 += line_size;
 287     }
 288     return s;
 289 }
 290
 291 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 292 {
 293     int s, i;
 294     uint32_t *sq = ff_squareTbl + 256;
 295
 296     s = 0;
 297     for (i = 0; i < h; i++) {
 298         s += sq[pix1[0] - pix2[0]];
 299         s += sq[pix1[1] - pix2[1]];
 300         s += sq[pix1[2] - pix2[2]];
 301         s += sq[pix1[3] - pix2[3]];
 302         s += sq[pix1[4] - pix2[4]];
 303         s += sq[pix1[5] - pix2[5]];
 304         s += sq[pix1[6] - pix2[6]];
 305         s += sq[pix1[7] - pix2[7]];
 306         pix1 += line_size;
 307         pix2 += line_size;
 308     }
 309     return s;
 310 }
 311
 312 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 313 {
 314     int s, i;
 315     uint32_t *sq = ff_squareTbl + 256;
 316
 317     s = 0;
 318     for (i = 0; i < h; i++) {
 319         s += sq[pix1[ 0] - pix2[ 0]];
 320         s += sq[pix1[ 1] - pix2[ 1]];
 321         s += sq[pix1[ 2] - pix2[ 2]];
 322         s += sq[pix1[ 3] - pix2[ 3]];
 323         s += sq[pix1[ 4] - pix2[ 4]];
 324         s += sq[pix1[ 5] - pix2[ 5]];
 325         s += sq[pix1[ 6] - pix2[ 6]];
 326         s += sq[pix1[ 7] - pix2[ 7]];
 327         s += sq[pix1[ 8] - pix2[ 8]];
 328         s += sq[pix1[ 9] - pix2[ 9]];
 329         s += sq[pix1[10] - pix2[10]];
 330         s += sq[pix1[11] - pix2[11]];
 331         s += sq[pix1[12] - pix2[12]];
 332         s += sq[pix1[13] - pix2[13]];
 333         s += sq[pix1[14] - pix2[14]];
 334         s += sq[pix1[15] - pix2[15]];
 335
 336         pix1 += line_size;
 337         pix2 += line_size;
 338     }
 339     return s;
 340 }
 341
 342 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 343                           const uint8_t *s2, int stride){
 344     int i;
 345
 346     /* read the pixels */
 347     for(i=0;i<8;i++) {
 348         block[0] = s1[0] - s2[0];
 349         block[1] = s1[1] - s2[1];
 350         block[2] = s1[2] - s2[2];
 351         block[3] = s1[3] - s2[3];
 352         block[4] = s1[4] - s2[4];
 353         block[5] = s1[5] - s2[5];
 354         block[6] = s1[6] - s2[6];
 355         block[7] = s1[7] - s2[7];
 356         s1 += stride;
 357         s2 += stride;
 358         block += 8;
 359     }
 360 }
 361
 362
 363 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 364                              int line_size)
 365 {
 366     int i;
 367
 368     /* read the pixels */
 369     for(i=0;i<8;i++) {
 370         pixels[0] = av_clip_uint8(block[0]);
 371         pixels[1] = av_clip_uint8(block[1]);
 372         pixels[2] = av_clip_uint8(block[2]);
 373         pixels[3] = av_clip_uint8(block[3]);
 374         pixels[4] = av_clip_uint8(block[4]);
 375         pixels[5] = av_clip_uint8(block[5]);
 376         pixels[6] = av_clip_uint8(block[6]);
 377         pixels[7] = av_clip_uint8(block[7]);
 378
 379         pixels += line_size;
 380         block += 8;
 381     }
 382 }
 383
 384 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 385                                     uint8_t *restrict pixels,
 386                                     int line_size)
 387 {
 388     int i, j;
 389
 390     for (i = 0; i < 8; i++) {
 391         for (j = 0; j < 8; j++) {
 392             if (*block < -128)
 393                 *pixels = 0;
 394             else if (*block > 127)
 395                 *pixels = 255;
 396             else
 397                 *pixels = (uint8_t)(*block + 128);
 398             block++;
 399             pixels++;
 400         }
 401         pixels += (line_size - 8);
 402     }
 403 }
 404
 405 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 406                              int line_size)
 407 {
 408     int i;
 409
 410     /* read the pixels */
 411     for(i=0;i<8;i++) {
 412         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 413         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 414         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 415         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 416         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 417         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 418         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 419         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 420         pixels += line_size;
 421         block += 8;
 422     }
 423 }
 424
 425 static int sum_abs_dctelem_c(DCTELEM *block)
 426 {
 427     int sum=0, i;
 428     for(i=0; i<64; i++)
 429         sum+= FFABS(block[i]);
 430     return sum;
 431 }
 432
 433 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 434 {
 435     int i;
 436
 437     for (i = 0; i < h; i++) {
 438         memset(block, value, 16);
 439         block += line_size;
 440     }
 441 }
 442
 443 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 444 {
 445     int i;
 446
 447     for (i = 0; i < h; i++) {
 448         memset(block, value, 8);
 449         block += line_size;
 450     }
 451 }
 452
 453 #define avg2(a,b) ((a+b+1)>>1)
 454 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 455
 456 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 457 {
 458     const int A=(16-x16)*(16-y16);
 459     const int B=(   x16)*(16-y16);
 460     const int C=(16-x16)*(   y16);
 461     const int D=(   x16)*(   y16);
 462     int i;
 463
 464     for(i=0; i<h; i++)
 465     {
 466         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 467         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 468         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 469         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 470         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 471         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 472         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 473         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 474         dst+= stride;
 475         src+= stride;
 476     }
 477 }
 478
 479 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 480                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 481 {
 482     int y, vx, vy;
 483     const int s= 1<<shift;
 484
 485     width--;
 486     height--;
 487
 488     for(y=0; y<h; y++){
 489         int x;
 490
 491         vx= ox;
 492         vy= oy;
 493         for(x=0; x<8; x++){ //XXX FIXME optimize
 494             int src_x, src_y, frac_x, frac_y, index;
 495
 496             src_x= vx>>16;
 497             src_y= vy>>16;
 498             frac_x= src_x&(s-1);
 499             frac_y= src_y&(s-1);
 500             src_x>>=shift;
 501             src_y>>=shift;
 502
 503             if((unsigned)src_x < width){
 504                 if((unsigned)src_y < height){
 505                     index= src_x + src_y*stride;
 506                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 507                                            + src[index       +1]*   frac_x )*(s-frac_y)
 508                                         + (  src[index+stride  ]*(s-frac_x)
 509                                            + src[index+stride+1]*   frac_x )*   frac_y
 510                                         + r)>>(shift*2);
 511                 }else{
 512                     index= src_x + av_clip(src_y, 0, height)*stride;
 513                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 514                                           + src[index       +1]*   frac_x )*s
 515                                         + r)>>(shift*2);
 516                 }
 517             }else{
 518                 if((unsigned)src_y < height){
 519                     index= av_clip(src_x, 0, width) + src_y*stride;
 520                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 521                                            + src[index+stride  ]*   frac_y )*s
 522                                         + r)>>(shift*2);
 523                 }else{
 524                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 525                     dst[y*stride + x]=    src[index         ];
 526                 }
 527             }
 528
 529             vx+= dxx;
 530             vy+= dyx;
 531         }
 532         ox += dxy;
 533         oy += dyy;
 534     }
 535 }
 536
 537 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 538     switch(width){
 539     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 540     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 541     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 542     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 543     }
 544 }
 545
 546 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 547     int i,j;
 548     for (i=0; i < height; i++) {
 549       for (j=0; j < width; j++) {
 550         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 551       }
 552       src += stride;
 553       dst += stride;
 554     }
 555 }
 556
 557 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 558     int i,j;
 559     for (i=0; i < height; i++) {
 560       for (j=0; j < width; j++) {
 561         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 562       }
 563       src += stride;
 564       dst += stride;
 565     }
 566 }
 567
 568 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 569     int i,j;
 570     for (i=0; i < height; i++) {
 571       for (j=0; j < width; j++) {
 572         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 573       }
 574       src += stride;
 575       dst += stride;
 576     }
 577 }
 578
 579 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 580     int i,j;
 581     for (i=0; i < height; i++) {
 582       for (j=0; j < width; j++) {
 583         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 584       }
 585       src += stride;
 586       dst += stride;
 587     }
 588 }
 589
 590 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 591     int i,j;
 592     for (i=0; i < height; i++) {
 593       for (j=0; j < width; j++) {
 594         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 595       }
 596       src += stride;
 597       dst += stride;
 598     }
 599 }
 600
 601 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 602     int i,j;
 603     for (i=0; i < height; i++) {
 604       for (j=0; j < width; j++) {
 605         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 606       }
 607       src += stride;
 608       dst += stride;
 609     }
 610 }
 611
 612 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 613     int i,j;
 614     for (i=0; i < height; i++) {
 615       for (j=0; j < width; j++) {
 616         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 617       }
 618       src += stride;
 619       dst += stride;
 620     }
 621 }
 622
 623 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 624     int i,j;
 625     for (i=0; i < height; i++) {
 626       for (j=0; j < width; j++) {
 627         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 628       }
 629       src += stride;
 630       dst += stride;
 631     }
 632 }
 633
 634 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 635     switch(width){
 636     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 637     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 638     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 639     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 640     }
 641 }
 642
 643 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 644     int i,j;
 645     for (i=0; i < height; i++) {
 646       for (j=0; j < width; j++) {
 647         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 648       }
 649       src += stride;
 650       dst += stride;
 651     }
 652 }
 653
 654 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 655     int i,j;
 656     for (i=0; i < height; i++) {
 657       for (j=0; j < width; j++) {
 658         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 659       }
 660       src += stride;
 661       dst += stride;
 662     }
 663 }
 664
 665 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 666     int i,j;
 667     for (i=0; i < height; i++) {
 668       for (j=0; j < width; j++) {
 669         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 670       }
 671       src += stride;
 672       dst += stride;
 673     }
 674 }
 675
 676 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 677     int i,j;
 678     for (i=0; i < height; i++) {
 679       for (j=0; j < width; j++) {
 680         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 681       }
 682       src += stride;
 683       dst += stride;
 684     }
 685 }
 686
 687 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 688     int i,j;
 689     for (i=0; i < height; i++) {
 690       for (j=0; j < width; j++) {
 691         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 692       }
 693       src += stride;
 694       dst += stride;
 695     }
 696 }
 697
 698 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 699     int i,j;
 700     for (i=0; i < height; i++) {
 701       for (j=0; j < width; j++) {
 702         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 703       }
 704       src += stride;
 705       dst += stride;
 706     }
 707 }
 708
 709 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 710     int i,j;
 711     for (i=0; i < height; i++) {
 712       for (j=0; j < width; j++) {
 713         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 714       }
 715       src += stride;
 716       dst += stride;
 717     }
 718 }
 719
 720 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 721     int i,j;
 722     for (i=0; i < height; i++) {
 723       for (j=0; j < width; j++) {
 724         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 725       }
 726       src += stride;
 727       dst += stride;
 728     }
 729 }
 730
 731 #define QPEL_MC(r, OPNAME, RND, OP) \
 732 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 733     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 734     int i;\
 735     for(i=0; i<h; i++)\
 736     {\
 737         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 738         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 739         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 740         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 741         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 742         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 743         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 744         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 745         dst+=dstStride;\
 746         src+=srcStride;\
 747     }\
 748 }\
 749 \
 750 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 751     const int w=8;\
 752     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 753     int i;\
 754     for(i=0; i<w; i++)\
 755     {\
 756         const int src0= src[0*srcStride];\
 757         const int src1= src[1*srcStride];\
 758         const int src2= src[2*srcStride];\
 759         const int src3= src[3*srcStride];\
 760         const int src4= src[4*srcStride];\
 761         const int src5= src[5*srcStride];\
 762         const int src6= src[6*srcStride];\
 763         const int src7= src[7*srcStride];\
 764         const int src8= src[8*srcStride];\
 765         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 766         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 767         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 768         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 769         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 770         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 771         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 772         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 773         dst++;\
 774         src++;\
 775     }\
 776 }\
 777 \
 778 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 779     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 780     int i;\
 781     \
 782     for(i=0; i<h; i++)\
 783     {\
 784         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 785         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 786         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 787         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 788         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 789         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 790         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 791         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 792         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 793         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 794         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 795         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 796         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 797         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 798         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 799         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 800         dst+=dstStride;\
 801         src+=srcStride;\
 802     }\
 803 }\
 804 \
 805 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 806     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 807     int i;\
 808     const int w=16;\
 809     for(i=0; i<w; i++)\
 810     {\
 811         const int src0= src[0*srcStride];\
 812         const int src1= src[1*srcStride];\
 813         const int src2= src[2*srcStride];\
 814         const int src3= src[3*srcStride];\
 815         const int src4= src[4*srcStride];\
 816         const int src5= src[5*srcStride];\
 817         const int src6= src[6*srcStride];\
 818         const int src7= src[7*srcStride];\
 819         const int src8= src[8*srcStride];\
 820         const int src9= src[9*srcStride];\
 821         const int src10= src[10*srcStride];\
 822         const int src11= src[11*srcStride];\
 823         const int src12= src[12*srcStride];\
 824         const int src13= src[13*srcStride];\
 825         const int src14= src[14*srcStride];\
 826         const int src15= src[15*srcStride];\
 827         const int src16= src[16*srcStride];\
 828         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 829         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 830         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 831         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 832         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 833         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 834         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 835         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 836         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 837         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 838         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 839         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 840         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 841         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 842         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 843         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 844         dst++;\
 845         src++;\
 846     }\
 847 }\
 848 \
 849 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 850     uint8_t half[64];\
 851     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 852     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 853 }\
 854 \
 855 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 856     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 857 }\
 858 \
 859 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 860     uint8_t half[64];\
 861     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 862     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 863 }\
 864 \
 865 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 866     uint8_t full[16*9];\
 867     uint8_t half[64];\
 868     copy_block9(full, src, 16, stride, 9);\
 869     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 870     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 871 }\
 872 \
 873 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 874     uint8_t full[16*9];\
 875     copy_block9(full, src, 16, stride, 9);\
 876     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 877 }\
 878 \
 879 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 880     uint8_t full[16*9];\
 881     uint8_t half[64];\
 882     copy_block9(full, src, 16, stride, 9);\
 883     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 884     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 885 }\
 886 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 887     uint8_t full[16*9];\
 888     uint8_t halfH[72];\
 889     uint8_t halfV[64];\
 890     uint8_t halfHV[64];\
 891     copy_block9(full, src, 16, stride, 9);\
 892     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 893     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 894     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 895     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 896 }\
 897 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 898     uint8_t full[16*9];\
 899     uint8_t halfH[72];\
 900     uint8_t halfHV[64];\
 901     copy_block9(full, src, 16, stride, 9);\
 902     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 903     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 904     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 905     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 906 }\
 907 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 908     uint8_t full[16*9];\
 909     uint8_t halfH[72];\
 910     uint8_t halfV[64];\
 911     uint8_t halfHV[64];\
 912     copy_block9(full, src, 16, stride, 9);\
 913     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 914     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 915     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 916     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 917 }\
 918 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 919     uint8_t full[16*9];\
 920     uint8_t halfH[72];\
 921     uint8_t halfHV[64];\
 922     copy_block9(full, src, 16, stride, 9);\
 923     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 924     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 925     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 926     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 927 }\
 928 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
 929     uint8_t full[16*9];\
 930     uint8_t halfH[72];\
 931     uint8_t halfV[64];\
 932     uint8_t halfHV[64];\
 933     copy_block9(full, src, 16, stride, 9);\
 934     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 935     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 936     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 937     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 938 }\
 939 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
 940     uint8_t full[16*9];\
 941     uint8_t halfH[72];\
 942     uint8_t halfHV[64];\
 943     copy_block9(full, src, 16, stride, 9);\
 944     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 945     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 946     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 947     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 948 }\
 949 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
 950     uint8_t full[16*9];\
 951     uint8_t halfH[72];\
 952     uint8_t halfV[64];\
 953     uint8_t halfHV[64];\
 954     copy_block9(full, src, 16, stride, 9);\
 955     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 956     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 957     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 958     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 959 }\
 960 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
 961     uint8_t full[16*9];\
 962     uint8_t halfH[72];\
 963     uint8_t halfHV[64];\
 964     copy_block9(full, src, 16, stride, 9);\
 965     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 966     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 967     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 968     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 969 }\
 970 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
 971     uint8_t halfH[72];\
 972     uint8_t halfHV[64];\
 973     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 974     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 975     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 976 }\
 977 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
 978     uint8_t halfH[72];\
 979     uint8_t halfHV[64];\
 980     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 981     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 982     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 983 }\
 984 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
 985     uint8_t full[16*9];\
 986     uint8_t halfH[72];\
 987     uint8_t halfV[64];\
 988     uint8_t halfHV[64];\
 989     copy_block9(full, src, 16, stride, 9);\
 990     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 991     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 992     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 993     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
 994 }\
 995 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
 996     uint8_t full[16*9];\
 997     uint8_t halfH[72];\
 998     copy_block9(full, src, 16, stride, 9);\
 999     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1000     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1001     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1002 }\
1003 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1004     uint8_t full[16*9];\
1005     uint8_t halfH[72];\
1006     uint8_t halfV[64];\
1007     uint8_t halfHV[64];\
1008     copy_block9(full, src, 16, stride, 9);\
1009     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1010     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1011     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1012     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1013 }\
1014 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1015     uint8_t full[16*9];\
1016     uint8_t halfH[72];\
1017     copy_block9(full, src, 16, stride, 9);\
1018     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1019     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1020     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1021 }\
1022 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1023     uint8_t halfH[72];\
1024     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1025     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1026 }\
1027 \
1028 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1029     uint8_t half[256];\
1030     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1031     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1032 }\
1033 \
1034 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1035     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1036 }\
1037 \
1038 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1039     uint8_t half[256];\
1040     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1041     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1042 }\
1043 \
1044 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1045     uint8_t full[24*17];\
1046     uint8_t half[256];\
1047     copy_block17(full, src, 24, stride, 17);\
1048     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1049     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1050 }\
1051 \
1052 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1053     uint8_t full[24*17];\
1054     copy_block17(full, src, 24, stride, 17);\
1055     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1056 }\
1057 \
1058 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1059     uint8_t full[24*17];\
1060     uint8_t half[256];\
1061     copy_block17(full, src, 24, stride, 17);\
1062     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1063     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1064 }\
1065 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1066     uint8_t full[24*17];\
1067     uint8_t halfH[272];\
1068     uint8_t halfV[256];\
1069     uint8_t halfHV[256];\
1070     copy_block17(full, src, 24, stride, 17);\
1071     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1072     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1073     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1074     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1075 }\
1076 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1077     uint8_t full[24*17];\
1078     uint8_t halfH[272];\
1079     uint8_t halfHV[256];\
1080     copy_block17(full, src, 24, stride, 17);\
1081     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1082     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1083     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1084     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1085 }\
1086 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1087     uint8_t full[24*17];\
1088     uint8_t halfH[272];\
1089     uint8_t halfV[256];\
1090     uint8_t halfHV[256];\
1091     copy_block17(full, src, 24, stride, 17);\
1092     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1093     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1094     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1095     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1096 }\
1097 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1098     uint8_t full[24*17];\
1099     uint8_t halfH[272];\
1100     uint8_t halfHV[256];\
1101     copy_block17(full, src, 24, stride, 17);\
1102     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1103     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1104     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1105     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1106 }\
1107 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1108     uint8_t full[24*17];\
1109     uint8_t halfH[272];\
1110     uint8_t halfV[256];\
1111     uint8_t halfHV[256];\
1112     copy_block17(full, src, 24, stride, 17);\
1113     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1114     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1115     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1116     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1117 }\
1118 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1119     uint8_t full[24*17];\
1120     uint8_t halfH[272];\
1121     uint8_t halfHV[256];\
1122     copy_block17(full, src, 24, stride, 17);\
1123     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1124     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1125     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1126     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1127 }\
1128 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1129     uint8_t full[24*17];\
1130     uint8_t halfH[272];\
1131     uint8_t halfV[256];\
1132     uint8_t halfHV[256];\
1133     copy_block17(full, src, 24, stride, 17);\
1134     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1135     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1136     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1137     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1138 }\
1139 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1140     uint8_t full[24*17];\
1141     uint8_t halfH[272];\
1142     uint8_t halfHV[256];\
1143     copy_block17(full, src, 24, stride, 17);\
1144     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1145     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1146     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1147     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1148 }\
1149 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1150     uint8_t halfH[272];\
1151     uint8_t halfHV[256];\
1152     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1153     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1154     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1155 }\
1156 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1157     uint8_t halfH[272];\
1158     uint8_t halfHV[256];\
1159     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1160     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1161     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1162 }\
1163 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1164     uint8_t full[24*17];\
1165     uint8_t halfH[272];\
1166     uint8_t halfV[256];\
1167     uint8_t halfHV[256];\
1168     copy_block17(full, src, 24, stride, 17);\
1169     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1170     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1171     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1172     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1173 }\
1174 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1175     uint8_t full[24*17];\
1176     uint8_t halfH[272];\
1177     copy_block17(full, src, 24, stride, 17);\
1178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1179     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1180     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1181 }\
1182 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1183     uint8_t full[24*17];\
1184     uint8_t halfH[272];\
1185     uint8_t halfV[256];\
1186     uint8_t halfHV[256];\
1187     copy_block17(full, src, 24, stride, 17);\
1188     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1189     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1190     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1191     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1192 }\
1193 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1194     uint8_t full[24*17];\
1195     uint8_t halfH[272];\
1196     copy_block17(full, src, 24, stride, 17);\
1197     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1198     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1199     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1200 }\
1201 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1202     uint8_t halfH[272];\
1203     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1204     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1205 }
1206
/*
 * Store operators handed to QPEL_MC as its last argument.
 *
 * `b` is the raw filter sum produced by the lowpass helpers; their taps
 * (20, 20, -6, -6, 3, 3, -1, -1) add up to 32, which is why the sum is
 * normalised with >>5.  The result is clamped through `cm`, which is not
 * defined here: it must be a local in the function where the macro expands
 * (the lowpass helpers declare it as ff_cropTbl + MAX_NEG_CROP).
 *
 *   op_put          bias 16 (half of 32) before the shift, then store.
 *   op_put_no_rnd   bias 15 instead of 16, then store.
 *   op_avg          like op_put, then average with the value already in
 *                   `a`, adding 1 before halving.
 *   op_avg_no_rnd   bias 15 and no +1 in the average; not referenced by any
 *                   of the active instantiations below.
 *
 * `a` is evaluated more than once in the avg variants, so it must be a
 * side-effect-free lvalue.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/*
 * Instantiate the qpel function families.  Judging by how the macro body
 * uses its parameters, the second argument is the prefix of the generated
 * function names, the third is pasted between `put` and the helper name for
 * the intermediate passes, and the fourth is the store operator.
 * NOTE(review): the meaning of the first argument (0/1) is not visible in
 * this part of the file -- confirm against the QPEL_MC definition.
 */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The store operators are only meant for the instantiations above. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1220
/*
 * The mc00 names are plain aliases for existing block put/avg routines.
 * Both put_no_rnd aliases point at a `put` routine, i.e. the same kind of
 * target as the rounding variants.
 * NOTE(review): the 16x16 no_rnd alias targets the `_8_c`-suffixed symbol
 * while every other alias uses the unsuffixed name -- confirm both symbols
 * exist and that the difference is intentional.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1227
1228 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1229     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1230     int i;
1231
1232     for(i=0; i<h; i++){
1233         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1234         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1235         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1236         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1237         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1238         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1239         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1240         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1241         dst+=dstStride;
1242         src+=srcStride;
1243     }
1244 }
1245
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points.  Each one simply forwards to the matching
 * put/avg *_xy2_8_c routine with the block height filled in
 * (16 for the 16-wide variants, 8 for the 8-wide ones).
 */

/** put, 16x16: forwards to put_pixels16_xy2_8_c with h = 16. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** avg, 16x16: forwards to avg_pixels16_xy2_8_c with h = 16. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** put, 8x8: forwards to put_pixels8_xy2_8_c with h = 8. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

/** avg, 8x8: forwards to avg_pixels8_xy2_8_c with h = 8. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1260
#if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME) emits nine exported functions named
 * ff_<OPNAME>_dirac_pixels{8,16,32}{,_l2,_l4}_c.
 *
 * All of them take an array of source pointers:
 *   - the plain variants use src[0] only,
 *   - the _l2 variants pass src[0] and src[1] to the <OPNAME>_pixelsN_l2_8
 *     helper,
 *   - the _l4 variants pass src[0] .. src[3] to <OPNAME>_pixelsN_l4_8.
 * The same `stride` is used for the destination and for every source.
 *
 * There is no native 32-wide helper, so the 32-wide variants run the
 * 16-wide helper twice: first on the left half, then 16 pixels to the right.
 */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    int x;\
    for (x = 0; x < 32; x += 16)\
        OPNAME ## _pixels16_8_c(dst + x, src[0] + x, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1],\
                            stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1],\
                             stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    int x;\
    for (x = 0; x < 32; x += 16)\
        OPNAME ## _pixels16_l2_8(dst + x, src[0] + x, src[1] + x,\
                                 stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3],\
                            stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3],\
                             stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    int x;\
    for (x = 0; x < 32; x += 16)\
        OPNAME ## _pixels16_l4_8(dst + x, src[0] + x, src[1] + x, src[2] + x, src[3] + x,\
                                 stride, stride, stride, stride, stride, h);\
}
DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1305
1306 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1307     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1308     int i;
1309
1310     for(i=0; i<w; i++){
1311         const int src_1= src[ -srcStride];
1312         const int src0 = src[0          ];
1313         const int src1 = src[  srcStride];
1314         const int src2 = src[2*srcStride];
1315         const int src3 = src[3*srcStride];
1316         const int src4 = src[4*srcStride];
1317         const int src5 = src[5*srcStride];
1318         const int src6 = src[6*srcStride];
1319         const int src7 = src[7*srcStride];
1320         const int src8 = src[8*srcStride];
1321         const int src9 = src[9*srcStride];
1322         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1323         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1324         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1325         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1326         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1327         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1328         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1329         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1330         src++;
1331         dst++;
1332     }
1333 }
1334
/**
 * 8x8 mspel position "10": the horizontally filtered block is written to a
 * temporary and then combined with the unshifted source by put_pixels8_l2_8
 * (presumably a two-source average -- the helper is defined elsewhere).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1340
/**
 * 8x8 mspel position "20": the horizontal lowpass written straight into dst,
 * using `stride` for both source and destination.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1344
/**
 * 8x8 mspel position "30": same as position "10", except that the filtered
 * block is combined with the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1350
/**
 * 8x8 mspel position "02": the vertical lowpass written straight into dst,
 * using `stride` for both source and destination.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1354
/**
 * 8x8 mspel position "12".
 *
 * Three intermediates are built:
 *   - hpass:  horizontal lowpass of 11 rows starting one row above src
 *             (the vertical filter needs one row above and two below),
 *   - vpass:  vertical lowpass of the source itself,
 *   - hvpass: vertical lowpass of hpass, skipping its extra top row.
 * vpass and hvpass are then combined into dst by put_pixels8_l2_8.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[8 * 11];
    uint8_t vpass[8 * 8];
    uint8_t hvpass[8 * 8];

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * 8x8 mspel position "32".
 *
 * Identical to position "12" except that the purely vertical intermediate
 * is taken from the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[8 * 11];
    uint8_t vpass[8 * 8];
    uint8_t hvpass[8 * 8];

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * 8x8 mspel position "22": horizontal lowpass of 11 rows (starting one row
 * above src) into a temporary, followed by a vertical lowpass of that
 * temporary written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[8 * 11];

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    /* hpass + 8 skips the extra top row; the vertical filter reads it as row -1. */
    wmv2_mspel8_v_lowpass(dst, hpass + 8, stride, 8, 8);
}
1378
1379 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1380     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1381     int x;
1382     const int strength= ff_h263_loop_filter_strength[qscale];
1383
1384     for(x=0; x<8; x++){
1385         int d1, d2, ad1;
1386         int p0= src[x-2*stride];
1387         int p1= src[x-1*stride];
1388         int p2= src[x+0*stride];
1389         int p3= src[x+1*stride];
1390         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1391
1392         if     (d<-2*strength) d1= 0;
1393         else if(d<-  strength) d1=-2*strength - d;
1394         else if(d<   strength) d1= d;
1395         else if(d< 2*strength) d1= 2*strength - d;
1396         else                   d1= 0;
1397
1398         p1 += d1;
1399         p2 -= d1;
1400         if(p1&256) p1= ~(p1>>31);
1401         if(p2&256) p2= ~(p2>>31);
1402
1403         src[x-1*stride] = p1;
1404         src[x+0*stride] = p2;
1405
1406         ad1= FFABS(d1)>>1;
1407
1408         d2= av_clip((p0-p3)/4, -ad1, ad1);
1409
1410         src[x-2*stride] = p0 - d2;
1411         src[x+  stride] = p3 + d2;
1412     }
1413     }
1414 }
1415
1416 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1417     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1418     int y;
1419     const int strength= ff_h263_loop_filter_strength[qscale];
1420
1421     for(y=0; y<8; y++){
1422         int d1, d2, ad1;
1423         int p0= src[y*stride-2];
1424         int p1= src[y*stride-1];
1425         int p2= src[y*stride+0];
1426         int p3= src[y*stride+1];
1427         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1428
1429         if     (d<-2*strength) d1= 0;
1430         else if(d<-  strength) d1=-2*strength - d;
1431         else if(d<   strength) d1= d;
1432         else if(d< 2*strength) d1= 2*strength - d;
1433         else                   d1= 0;
1434
1435         p1 += d1;
1436         p2 -= d1;
1437         if(p1&256) p1= ~(p1>>31);
1438         if(p2&256) p2= ~(p2>>31);
1439
1440         src[y*stride-1] = p1;
1441         src[y*stride+0] = p2;
1442
1443         ad1= FFABS(d1)>>1;
1444
1445         d2= av_clip((p0-p3)/4, -ad1, ad1);
1446
1447         src[y*stride-2] = p0 - d2;
1448         src[y*stride+1] = p3 + d2;
1449     }
1450     }
1451 }
1452
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a temporary; the top and bottom rows are not
 * filtered, just scaled by 4 so that every temporary carries the same gain.
 * Pass 2 filters horizontally back into src; the left and right columns only
 * undo the gain of 4 (with rounding), the interior applies the (1, 2, 1)
 * kernel and divides by 16 (with rounding).
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int tmp[8 * 8];
    int row, col;

    /* Pass 1: vertical. */
    for (col = 0; col < 8; col++) {
        tmp[0 * 8 + col] = 4 * src[0 * stride + col];
        tmp[7 * 8 + col] = 4 * src[7 * stride + col];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *in  = src + row * stride;
        int           *out = tmp + row * 8;

        for (col = 0; col < 8; col++)
            out[col] = in[col - stride] + 2 * in[col] + in[col + stride];
    }

    /* Pass 2: horizontal. */
    for (row = 0; row < 8; row++) {
        const int *in  = tmp + row * 8;
        uint8_t   *out = src + row * stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2 * in[col] + in[col + 1] + 8) >> 4;
    }
}
1479
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1507
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which each pixel is avg2() of itself and its right-hand neighbour.
 * Every reference row therefore reads 17 pixels (pix2[0] .. pix2[16]).
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1535
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * in which every pixel is combined with the pixel one row below by avg2().
 *
 * Row h-1 pairs with row h of pix2, so h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1565
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * in which every pixel is combined by avg4() with its right, lower and
 * lower-right neighbours.
 *
 * 17 bytes of h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1595
/**
 * Sum of absolute differences between two blocks 8 pixels wide and h rows
 * high.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1615
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * in which every pixel is combined with its right-hand neighbour by avg2().
 *
 * Column 7 pairs pix2[7] with pix2[8], so 9 bytes are read from every pix2
 * row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1635
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * in which every pixel is combined with the pixel one row below by avg2().
 *
 * Row h-1 pairs with row h of pix2, so h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1657
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * in which every pixel is combined by avg4() with its right, lower and
 * lower-right neighbours.
 *
 * 9 bytes of h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1679
1680 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1681     MpegEncContext *c = v;
1682     int score1=0;
1683     int score2=0;
1684     int x,y;
1685
1686     for(y=0; y<h; y++){
1687         for(x=0; x<16; x++){
1688             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1689         }
1690         if(y+1<h){
1691             for(x=0; x<15; x++){
1692                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1693                              - s1[x+1] + s1[x+1+stride])
1694                         -FFABS(  s2[x  ] - s2[x  +stride]
1695                              - s2[x+1] + s2[x+1+stride]);
1696             }
1697         }
1698         s1+= stride;
1699         s2+= stride;
1700     }
1701
1702     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1703     else  return score1 + FFABS(score2)*8;
1704 }
1705
1706 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1707     MpegEncContext *c = v;
1708     int score1=0;
1709     int score2=0;
1710     int x,y;
1711
1712     for(y=0; y<h; y++){
1713         for(x=0; x<8; x++){
1714             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1715         }
1716         if(y+1<h){
1717             for(x=0; x<7; x++){
1718                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1719                              - s1[x+1] + s1[x+1+stride])
1720                         -FFABS(  s2[x  ] - s2[x  +stride]
1721                              - s2[x+1] + s2[x+1+stride]);
1722             }
1723         }
1724         s1+= stride;
1725         s2+= stride;
1726     }
1727
1728     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1729     else  return score1 + FFABS(score2)*8;
1730 }
1731
1732 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1733     int i;
1734     unsigned int sum=0;
1735
1736     for(i=0; i<8*8; i++){
1737         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1738         int w= weight[i];
1739         b>>= RECON_SHIFT;
1740         assert(-512<b && b<512);
1741
1742         sum += (w*b)*(w*b)>>4;
1743     }
1744     return sum>>2;
1745 }
1746
1747 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1748     int i;
1749
1750     for(i=0; i<8*8; i++){
1751         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1752     }
1753 }
1754
1755 /**
1756  * Permute an 8x8 block.
1757  * @param block the block which will be permuted according to the given permutation vector
1758  * @param permutation the permutation vector
1759  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1760  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1761  *                  (inverse) permutated to scantable order!
1762  */
1763 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1764 {
1765     int i;
1766     DCTELEM temp[64];
1767
1768     if(last<=0) return;
1769     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1770
1771     for(i=0; i<=last; i++){
1772         const int j= scantable[i];
1773         temp[j]= block[j];
1774         block[j]=0;
1775     }
1776
1777     for(i=0; i<=last; i++){
1778         const int j= scantable[i];
1779         const int perm_j= permutation[j];
1780         block[perm_j]= temp[j];
1781     }
1782 }
1783
/**
 * Comparison function that ignores all of its arguments and reports a score
 * of zero; installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
1787
1788 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1789     int i;
1790
1791     memset(cmp, 0, sizeof(void*)*6);
1792
1793     for(i=0; i<6; i++){
1794         switch(type&0xFF){
1795         case FF_CMP_SAD:
1796             cmp[i]= c->sad[i];
1797             break;
1798         case FF_CMP_SATD:
1799             cmp[i]= c->hadamard8_diff[i];
1800             break;
1801         case FF_CMP_SSE:
1802             cmp[i]= c->sse[i];
1803             break;
1804         case FF_CMP_DCT:
1805             cmp[i]= c->dct_sad[i];
1806             break;
1807         case FF_CMP_DCT264:
1808             cmp[i]= c->dct264_sad[i];
1809             break;
1810         case FF_CMP_DCTMAX:
1811             cmp[i]= c->dct_max[i];
1812             break;
1813         case FF_CMP_PSNR:
1814             cmp[i]= c->quant_psnr[i];
1815             break;
1816         case FF_CMP_BIT:
1817             cmp[i]= c->bit[i];
1818             break;
1819         case FF_CMP_RD:
1820             cmp[i]= c->rd[i];
1821             break;
1822         case FF_CMP_VSAD:
1823             cmp[i]= c->vsad[i];
1824             break;
1825         case FF_CMP_VSSE:
1826             cmp[i]= c->vsse[i];
1827             break;
1828         case FF_CMP_ZERO:
1829             cmp[i]= zero_cmp;
1830             break;
1831         case FF_CMP_NSSE:
1832             cmp[i]= c->nsse[i];
1833             break;
1834 #if CONFIG_DWT
1835         case FF_CMP_W53:
1836             cmp[i]= c->w53[i];
1837             break;
1838         case FF_CMP_W97:
1839             cmp[i]= c->w97[i];
1840             break;
1841 #endif
1842         default:
1843             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1844         }
1845     }
1846 }
1847
/**
 * Add src to dst byte by byte (modulo 256), in place.
 *
 * The bulk of the data is handled sizeof(long) bytes at a time: the low seven
 * bits of every byte are added without carries crossing byte boundaries, and
 * the top bit of every byte is then fixed up with an XOR. Remaining bytes are
 * added one by one.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes; values below sizeof(long), including zero and
 *            negative values, are handled by the byte loop only
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* 0x7f / 0x80 replicated into every byte of an unsigned long */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;

    /* The bound must be computed in signed arithmetic: w - sizeof(long) would
     * be unsigned and wrap to a huge value whenever w < sizeof(long). */
    for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&low7) + (b&low7)) ^ ((a^b)&top1);
    }
    for (; i < w; i++)
        dst[i+0] += src[i+0];
}
1858
/**
 * Store the byte-wise difference src1 - src2 (modulo 256) in dst.
 *
 * When src2 is suitably aligned (or unaligned access is fast), the bulk is
 * handled sizeof(long) bytes at a time: the top bit of every byte of a is
 * forced on so that subtracting the low seven bits of b cannot borrow across
 * byte boundaries, and the top bits are then fixed up with an XOR. Otherwise
 * groups of eight bytes are subtracted individually. Remaining bytes are
 * always handled one by one.
 *
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes; values below sizeof(long), including zero and
 *             negative values, are handled by the byte loop only
 */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
    /* 0x7f / 0x80 replicated into every byte of an unsigned long */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;
#if !HAVE_FAST_UNALIGNED
    if ((uintptr_t)src2 & (sizeof(long)-1)) {
        for (i = 0; i+7 < w; i += 8) {
            int j;
            for (j = 0; j < 8; j++)
                dst[i+j] = src1[i+j]-src2[i+j];
        }
    }else
#endif
    /* The bound must be computed in signed arithmetic: w - sizeof(long) would
     * be unsigned and wrap to a huge value whenever w < sizeof(long). */
    for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
        long a = *(const long*)(src1+i);
        long b = *(const long*)(src2+i);
        *(long*)(dst+i) = ((a|top1) - (b&low7)) ^ ((a^b^top1)&top1);
    }
    for (; i < w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1883
/**
 * Reconstruct a run of bytes from residuals using a median predictor.
 *
 * For each position the predictor is mid_pred() of the previously
 * reconstructed byte, src1[i], and (previous + src1[i] - previous src1) & 0xFF;
 * diff[i] is added to it and the sum, truncated to a byte, becomes dst[i].
 * NOTE(review): src1 is presumably the previously decoded line - confirm
 * against the callers.
 *
 * @param dst      reconstructed output
 * @param src1     reference bytes
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: predecessor of the first output byte; out: last output byte
 * @param left_top in: predecessor of src1[0]; out: last src1 byte consumed
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur    = *left;
    uint8_t corner = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t ref = src1[x];

        cur    = mid_pred(cur, ref, (cur + ref - corner)&0xFF) + diff[x];
        corner = ref;
        dst[x] = cur;
    }

    *left     = cur;
    *left_top = corner;
}
1900
/**
 * Produce residuals for a run of bytes using a median predictor.
 *
 * For each position the predictor is mid_pred() of the previous src2 byte,
 * src1[i], and (previous src2 + src1[i] - previous src1) & 0xFF; dst[i]
 * receives src2[i] minus that predictor.
 * NOTE(review): src1 is presumably the line above src2 - confirm against the
 * callers.
 *
 * @param dst      residual output
 * @param src1     reference bytes
 * @param src2     bytes to encode
 * @param w        number of bytes
 * @param left     in: predecessor of src2[0]; out: last src2 byte consumed
 * @param left_top in: predecessor of src1[0]; out: last src1 byte consumed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur    = *left;
    uint8_t corner = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t ref = src1[x];
        const int pred = mid_pred(cur, ref, (cur + ref - corner)&0xFF);

        corner = ref;
        cur    = src2[x];
        dst[x] = cur - pred;
    }

    *left     = cur;
    *left_top = corner;
}
1918
/**
 * Write the running sum of src into dst.
 *
 * The accumulator is a full int; each dst byte receives its low 8 bits.
 *
 * @param dst output bytes
 * @param src input bytes
 * @param w   number of bytes
 * @param acc starting value of the accumulator
 * @return the untruncated accumulator after the last byte
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1937
/* Byte offsets of the four components inside one 4-byte pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Write per-component running sums of 4-byte pixels from src into dst.
 *
 * Each component keeps its own int accumulator; every output byte receives
 * the low 8 bits of the matching accumulator.
 *
 * @param dst   output pixels
 * @param src   input pixels
 * @param w     number of pixels
 * @param red   in/out accumulator for the R component
 * @param green in/out accumulator for the G component
 * @param blue  in/out accumulator for the B component
 * @param alpha in/out accumulator for the A component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int x;

    for (x = 0; x < w; x++, src += 4, dst += 4) {
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1978
/**
 * Butterfly into separate outputs: o1 = i1 + i2, o2 = i1 - i2.
 * o1 is stored before i1/i2 are evaluated again for o2, so the outputs must
 * not alias the inputs. Expands to a single statement; terminate the
 * invocation with a semicolon.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/**
 * In-place butterfly: x becomes x + y and y becomes x - y (old values).
 * Both operands are read into temporaries first. Expands to a single
 * statement; terminate the invocation with a semicolon.
 */
#define BUTTERFLY1(x,y) \
do {\
    int bfly_a_, bfly_b_;\
    bfly_a_= (x);\
    bfly_b_= (y);\
    (x)= bfly_a_+bfly_b_;\
    (y)= bfly_a_-bfly_b_;\
} while (0)

/** Sum of the magnitudes of both butterfly outputs: |x + y| + |x - y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1993
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the pixel difference src - dst.
 *
 * Each row of differences goes through three butterfly stages; the columns
 * then go through two explicit stages, and the third is folded into the
 * absolute-value summation by BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between successive rows
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass, one row of the difference per iteration */
    for (i = 0; i < 8; i++) {
        int *row           = temp + 8*i;
        const uint8_t *cur = src + stride*i;
        const uint8_t *ref = dst + stride*i;

        BUTTERFLY2(row[0], row[1], cur[0]-ref[0], cur[1]-ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-ref[2], cur[3]-ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-ref[4], cur[5]-ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-ref[6], cur[7]-ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass, one column per iteration */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
2038
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the pixels of src, with the |temp[0] + temp[32]| term (marked "-mean" in
 * the original) removed.
 *
 * Each row goes through three butterfly stages; the columns then go through
 * two explicit stages, and the third is folded into the absolute-value
 * summation by BUTTERFLYA.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between successive rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass, one row per iteration */
    for (i = 0; i < 8; i++) {
        int *row           = temp + 8*i;
        const uint8_t *pix = src + stride*i;

        BUTTERFLY2(row[0], row[1], pix[0], pix[1]);
        BUTTERFLY2(row[2], row[3], pix[2], pix[3]);
        BUTTERFLY2(row[4], row[5], pix[4], pix[5]);
        BUTTERFLY2(row[6], row[7], pix[6], pix[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass, one column per iteration */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* -mean: remove the |x + y| half of the first column's last butterfly */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
2086
2087 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2088     MpegEncContext * const s= (MpegEncContext *)c;
2089     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2090
2091     assert(h==8);
2092
2093     s->dsp.diff_pixels(temp, src1, src2, stride);
2094     s->dsp.fdct(temp);
2095     return s->dsp.sum_abs_dctelem(temp);
2096 }
2097
#if CONFIG_GPL
/**
 * One 8-point integer transform pass, built only from additions,
 * subtractions and shifts.
 *
 * The user defines SRC(n) to read input n and DST(n, v) to consume output n
 * before expanding the macro. It expands to a braced block, so it can serve
 * directly as a loop body.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score two 8x8 blocks by the sum of absolute transform coefficients of
 * their pixel difference, using the DCT8_1D integer transform.
 *
 * The first pass transforms every row in place; the second pass transforms
 * every column and accumulates the absolute outputs without storing them.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between successive rows
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int total = 0;
    int i;

    enc->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: results overwrite the row they came from */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: outputs are only summed */
#define SRC(x) dct[x][i]
#define DST(x,v) total += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST
    return total;
}
#endif
2150
2151 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2152     MpegEncContext * const s= (MpegEncContext *)c;
2153     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2154     int sum=0, i;
2155
2156     assert(h==8);
2157
2158     s->dsp.diff_pixels(temp, src1, src2, stride);
2159     s->dsp.fdct(temp);
2160
2161     for(i=0; i<64; i++)
2162         sum= FFMAX(sum, FFABS(temp[i]));
2163
2164     return sum;
2165 }
2166
2167 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2168     MpegEncContext * const s= (MpegEncContext *)c;
2169     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2170     DCTELEM * const bak = temp+64;
2171     int sum=0, i;
2172
2173     assert(h==8);
2174     s->mb_intra=0;
2175
2176     s->dsp.diff_pixels(temp, src1, src2, stride);
2177
2178     memcpy(bak, temp, 64*sizeof(DCTELEM));
2179
2180     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2181     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2182     ff_simple_idct_8(temp); //FIXME
2183
2184     for(i=0; i<64; i++)
2185         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2186
2187     return sum;
2188 }
2189
2190 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2191     MpegEncContext * const s= (MpegEncContext *)c;
2192     const uint8_t *scantable= s->intra_scantable.permutated;
2193     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2194     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2195     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2196     int i, last, run, bits, level, distortion, start_i;
2197     const int esc_length= s->ac_esc_length;
2198     uint8_t * length;
2199     uint8_t * last_length;
2200
2201     assert(h==8);
2202
2203     copy_block8(lsrc1, src1, 8, stride, 8);
2204     copy_block8(lsrc2, src2, 8, stride, 8);
2205
2206     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2207
2208     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2209
2210     bits=0;
2211
2212     if (s->mb_intra) {
2213         start_i = 1;
2214         length     = s->intra_ac_vlc_length;
2215         last_length= s->intra_ac_vlc_last_length;
2216         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2217     } else {
2218         start_i = 0;
2219         length     = s->inter_ac_vlc_length;
2220         last_length= s->inter_ac_vlc_last_length;
2221     }
2222
2223     if(last>=start_i){
2224         run=0;
2225         for(i=start_i; i<last; i++){
2226             int j= scantable[i];
2227             level= temp[j];
2228
2229             if(level){
2230                 level+=64;
2231                 if((level&(~127)) == 0){
2232                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2233                 }else
2234                     bits+= esc_length;
2235                 run=0;
2236             }else
2237                 run++;
2238         }
2239         i= scantable[last];
2240
2241         level= temp[i] + 64;
2242
2243         assert(level - 64);
2244
2245         if((level&(~127)) == 0){
2246             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2247         }else
2248             bits+= esc_length;
2249
2250     }
2251
2252     if(last>=0){
2253         if(s->mb_intra)
2254             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2255         else
2256             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2257     }
2258
2259     s->dsp.idct_add(lsrc2, 8, temp);
2260
2261     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2262
2263     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2264 }
2265
2266 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2267     MpegEncContext * const s= (MpegEncContext *)c;
2268     const uint8_t *scantable= s->intra_scantable.permutated;
2269     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2270     int i, last, run, bits, level, start_i;
2271     const int esc_length= s->ac_esc_length;
2272     uint8_t * length;
2273     uint8_t * last_length;
2274
2275     assert(h==8);
2276
2277     s->dsp.diff_pixels(temp, src1, src2, stride);
2278
2279     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2280
2281     bits=0;
2282
2283     if (s->mb_intra) {
2284         start_i = 1;
2285         length     = s->intra_ac_vlc_length;
2286         last_length= s->intra_ac_vlc_last_length;
2287         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2288     } else {
2289         start_i = 0;
2290         length     = s->inter_ac_vlc_length;
2291         last_length= s->inter_ac_vlc_last_length;
2292     }
2293
2294     if(last>=start_i){
2295         run=0;
2296         for(i=start_i; i<last; i++){
2297             int j= scantable[i];
2298             level= temp[j];
2299
2300             if(level){
2301                 level+=64;
2302                 if((level&(~127)) == 0){
2303                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2304                 }else
2305                     bits+= esc_length;
2306                 run=0;
2307             }else
2308                 run++;
2309         }
2310         i= scantable[last];
2311
2312         level= temp[i] + 64;
2313
2314         assert(level - 64);
2315
2316         if((level&(~127)) == 0){
2317             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2318         }else
2319             bits+= esc_length;
2320     }
2321
2322     return bits;
2323 }
2324
/**
 * Define vsad_intra<size>_c(): the sum of absolute differences between every
 * pixel of a block <size> pixels wide and the pixel directly below it, over
 * rows 0..h-2. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                           \
    int row, col;                                            \
                                                             \
    for (row = 1; row < h; row++) {                          \
        const uint8_t *below = s + stride;                   \
        for (col = 0; col < size; col++)                     \
            total += FFABS(s[col] - below[col]);             \
        s += stride;                                         \
    }                                                        \
                                                             \
    return total;                                            \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2342
/**
 * Sum of absolute vertical differences of the error signal s1 - s2 for a
 * block 16 pixels wide: each term is |(s1 - s2) at (x, y) - (s1 - s2) at
 * (x, y + 1)|, over rows 0..h-2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between successive rows
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2357
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/**
 * Define vsse_intra<size>_c(): the sum of squared differences between every
 * pixel of a block <size> pixels wide and the pixel directly below it, over
 * rows 0..h-2. The c and dummy arguments are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                           \
    int row, col;                                            \
                                                             \
    for (row = 1; row < h; row++) {                          \
        const uint8_t *below = s + stride;                   \
        for (col = 0; col < size; col++)                     \
            total += SQ(s[col] - below[col]);                \
        s += stride;                                         \
    }                                                        \
                                                             \
    return total;                                            \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2376
/**
 * Sum of squared vertical differences of the error signal s1 - s2 for a
 * block 16 pixels wide: each term is the square of
 * (s1 - s2) at (x, y) - (s1 - s2) at (x, y + 1), over rows 0..h-2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between successive rows
 * @param h      number of rows
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2391
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return the accumulated squared differences
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
2400
/* Instantiate the second-named function of each pair from the first one.
 * WRAPPER8_16_SQ is defined outside this section; presumably it builds a
 * 16-pixel-wide comparison out of calls to the 8x8 kernel - confirm against
 * the macro definition. The dct264 wrapper is guarded by CONFIG_GPL, like
 * dct264_sad8x8_c itself. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2411
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
2417
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len - 1 - i].
 *
 * @param dst  output vector
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
2424
/**
 * Multiply-add of float vectors: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output vector, len elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int left;

    for (left = len; left > 0; left--, dst++, src0++, src1++, src2++)
        *dst = *src0 * *src1 + *src2;
}
2430
/**
 * Windowed overlap of two len-element inputs into a 2*len-element output.
 *
 * For k in [0, len), with m = 2*len-1-k the mirrored position:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements, read front to back
 * @param src1 second input, len elements, read back to front
 * @param win  window, 2*len elements
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int   m  = 2*len - 1 - k;
        const float a  = src0[k];
        const float b  = src1[len - 1 - k];
        const float wk = win[k];
        const float wm = win[m];

        dst[k] = a*wm - b*wk;
        dst[m] = a*wk + b*wm;
    }
}
2447
/**
 * Scale a float vector by a constant: dst[i] = src[i] * mul.
 *
 * @param dst output vector, len elements
 * @param src input vector
 * @param mul scale factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int left;

    for (left = len; left > 0; left--)
        *dst++ = *src++ * mul;
}
2455
/**
 * Scaled accumulate: dst[i] += src[i] * mul.
 *
 * @param dst vector that is read and updated in place, len elements
 * @param src input vector
 * @param mul scale factor applied to src
 * @param len number of elements
 */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int left;

    for (left = len; left > 0; left--, dst++, src++)
        *dst += *src * mul;
}
2463
/**
 * In-place butterfly on two float vectors:
 * v1[i] becomes v1[i] + v2[i] and v2[i] becomes v1[i] - v2[i]
 * (both computed from the values held before the call).
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2474
/**
 * Butterfly of two float vectors with interleaved output:
 * dst[2*i] = src0[i] + src1[i], dst[2*i+1] = src0[i] - src1[i].
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements
 * @param len  number of input elements
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int left;

    for (left = len; left > 0; left--, dst += 2) {
        const float a = *src0++;
        const float b = *src1++;

        dst[0] = a + b;
        dst[1] = a - b;
    }
}
2486
/**
 * Dot product of two float vectors, accumulated in single precision
 * from the first element to the last.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[i] * v2[i]; 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int left;

    for (left = len; left > 0; left--)
        acc += *v1++ * *v2++;

    return acc;
}
2497
/**
 * Clip one value given as a raw 32-bit pattern, using integer compares only.
 *
 * The caller in this file (vector_clipf_c_opposite_sign) passes float bit
 * patterns with a negative lower and a positive upper bound. Under that
 * assumption an unsigned "a > mini" means a is below the lower bound, and
 * comparing with the sign bit flipped detects values above the upper bound.
 *
 * @param a        value to clip
 * @param mini     bit pattern of the lower bound
 * @param maxi     bit pattern of the upper bound
 * @param maxisign maxi with bit 31 flipped
 * @return mini, maxi or a, chosen by the two unsigned comparisons
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    return a > mini           ? mini
         : flipped > maxisign ? maxi
         :                      a;
}
2506
/**
 * Clip a float vector to [*min, *max] by comparing raw 32-bit patterns.
 *
 * Called by vector_clipf_c() only when min < 0 and max > 0. Elements are
 * handled in groups of 8, so len is effectively rounded up to a multiple
 * of 8 and the buffers must be large enough for that.
 *
 * NOTE(review): floats are accessed through uint32_t pointers; this relies
 * on the compiler tolerating that type punning.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the (negative) lower bound
 * @param max pointer to the (positive) upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits = *(uint32_t*)min;
    const uint32_t hi_bits = *(uint32_t*)max;
    const uint32_t hi_flip = hi_bits ^ (1U<<31);
    uint32_t *out = (uint32_t*)dst;
    const uint32_t *in = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds straddle zero the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Both paths process 8 elements per step, so len is effectively
 * rounded up to a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2542
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;

    for (; order != 0; order--, v1++, v2++)
        acc += *v1 * *v2;

    return acc;
}
2552
/**
 * Compute the dot product of v1 and v2 and, in the same pass, update v1 in
 * place with v1[i] += mul * v3[i]. The dot product uses the values v1 held
 * before the update.
 *
 * @param v1    first vector, read and then modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor applied to v3
 * @return sum of (old) v1[i] * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order != 0; order--) {
        const int cur = *v1;

        acc   += cur * *v2++;
        *v1++  = cur + mul * *v3++;
    }
    return acc;
}
2562
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient i is used
 * for both sample i and its mirror sample len-1-i. Each product is rounded
 * by adding 1<<14 and shifting right by 15. When len is odd the middle
 * output element is not written.
 *
 * @param output destination samples, len elements
 * @param input  source samples, len elements
 * @param window window coefficients, len/2 elements are read
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int head;

    for (head = 0; head < half; head++) {
        const unsigned int tail = len - head - 1;
        const int16_t coef      = window[head];

        output[head] = (MUL16(input[head], coef) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], coef) + (1 << 14)) >> 15;
    }
}
2575
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are processed 8 at a time in a do/while loop that stops only when
 * the unsigned counter reaches exactly 0, so len must be a nonzero multiple
 * of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements (nonzero multiple of 8)
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2591
/* Fixed-point constants for the WMV2 IDCT below, scaled by 2048:
 * Wk = round(2048 * sqrt(2) * cos(k*pi/16)) for k = 1..7.
 * W0 is the plain scale 2048 and is applied to the b[0]/b[4] terms. */
2592 #define W0 2048
2593 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2594 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2595 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2596 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2597 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2598 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2599 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2600
/**
 * Row pass of the WMV2 inverse DCT: transforms the 8 consecutive
 * coefficients b[0..7] in place using 32-bit integer arithmetic.
 *
 * @param b pointer to one row of 8 coefficients, overwritten with the result
 */
2601 static void wmv2_idct_row(short * b)
2602 {
2603     int s1,s2;
2604     int a0,a1,a2,a3,a4,a5,a6,a7;
    /* step 1: combine coefficient pairs (1,7), (5,3), (2,6) and (0,4)
     * with the W constants */
2605     /*step 1*/
2606     a1 = W1*b[1]+W7*b[7];
2607     a7 = W7*b[1]-W1*b[7];
2608     a5 = W5*b[5]+W3*b[3];
2609     a3 = W3*b[5]-W5*b[3];
2610     a2 = W2*b[2]+W6*b[6];
2611     a6 = W6*b[2]-W2*b[6];
2612     a0 = W0*b[0]+W0*b[4];
2613     a4 = W0*b[0]-W0*b[4];
    /* step 2: mix the odd terms; 181/256 approximates 1/sqrt(2),
     * +128 rounds before the shift */
2614     /*step 2*/
2615     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2616     s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: final butterflies; each output is rounded with 1<<7 and
     * scaled down by 8 bits. Outputs 0..3 and 7..4 are mirror pairs that
     * differ only in the sign of the odd contribution. */
2617     /*step 3*/
2618     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2619     b[1] = (a4+a6 +s1   + (1<<7))>>8;
2620     b[2] = (a4-a6 +s2   + (1<<7))>>8;
2621     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2622     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2623     b[5] = (a4-a6 -s2   + (1<<7))>>8;
2624     b[6] = (a4+a6 -s1   + (1<<7))>>8;
2625     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2626 }
/**
 * Column pass of the WMV2 inverse DCT: transforms the 8 coefficients
 * b[0], b[8], ..., b[56] (stride 8) in place.
 *
 * Same structure as wmv2_idct_row(), but the step-1 products are scaled
 * down by 3 bits first and the final outputs by 14 bits.
 *
 * @param b pointer to the top element of one column of an 8x8 block
 */
2627 static void wmv2_idct_col(short * b)
2628 {
2629     int s1,s2;
2630     int a0,a1,a2,a3,a4,a5,a6,a7;
    /* step 1: the rotated terms are rounded with +4 before >>3;
     * a0 and a4 are shifted without a rounding term */
2631     /*step 1, with extended precision*/
2632     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2633     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2634     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2635     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2636     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2637     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2638     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2639     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* step 2: 181/256 approximates 1/sqrt(2), +128 rounds before the shift */
2640     /*step 2*/
2641     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2642     s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: final butterflies, rounded with 1<<13 and scaled down by
     * 14 bits */
2643     /*step 3*/
2644     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2645     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2646     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2647     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2648
2649     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2650     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2651     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2652     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2653 }
/**
 * In-place 8x8 WMV2 inverse DCT: all eight rows are transformed first,
 * then all eight columns.
 *
 * @param block 64 coefficients stored row by row, overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);

    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
2664 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2665  converted */
/**
 * Run the WMV2 IDCT on block in place, then hand the result to
 * ff_put_pixels_clamped_c() together with dest and line_size
 * (presumably storing the clamped values into dest -- helper not shown here).
 */
2666 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2667 {
2668     ff_wmv2_idct_c(block);
2669     ff_put_pixels_clamped_c(block, dest, line_size);
2670 }
/**
 * Run the WMV2 IDCT on block in place, then hand the result to
 * ff_add_pixels_clamped_c() together with dest and line_size
 * (presumably adding it to dest with clamping -- helper not shown here).
 */
2671 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2672 {
2673     ff_wmv2_idct_c(block);
2674     ff_add_pixels_clamped_c(block, dest, line_size);
2675 }
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * ff_put_pixels_clamped_c() together with dest and line_size
 * (presumably storing the clamped values into dest -- helper not shown here).
 */
2676 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2677 {
2678     ff_j_rev_dct (block);
2679     ff_put_pixels_clamped_c(block, dest, line_size);
2680 }
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * ff_add_pixels_clamped_c() together with dest and line_size
 * (presumably adding it to dest with clamping -- helper not shown here).
 */
2681 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2682 {
2683     ff_j_rev_dct (block);
2684     ff_add_pixels_clamped_c(block, dest, line_size);
2685 }
2686
/* No-op with all parameters marked unused; ff_dsputil_init() installs it as
 * the default c->prefetch. */
2687 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2688
2689 /* init static data */
2690 av_cold void ff_dsputil_static_init(void)
2691 {
2692     int i;
2693
2694     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2695     for(i=0;i<MAX_NEG_CROP;i++) {
2696         ff_cropTbl[i] = 0;
2697         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2698     }
2699
2700     for(i=0;i<512;i++) {
2701         ff_squareTbl[i] = (i - 256) * (i - 256);
2702     }
2703
2704     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2705 }
2706
2707 int ff_check_alignment(void){
2708     static int did_fail=0;
2709     LOCAL_ALIGNED_16(int, aligned, [4]);
2710
2711     if((intptr_t)aligned & 15){
2712         if(!did_fail){
2713 #if HAVE_MMX || HAVE_ALTIVEC
2714             av_log(NULL, AV_LOG_ERROR,
2715                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2716                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2717                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2718                 "Do not report crashes to FFmpeg developers.\n");
2719 #endif
2720             did_fail=1;
2721         }
2722         return -1;
2723     }
2724     return 0;
2725 }
2726
/**
 * Fill a DSPContext with the C reference implementations, then give the
 * architecture-specific initializers a chance to run on the same context.
 *
 * Reads avctx->bits_per_raw_sample, avctx->dct_algo and avctx->idct_algo to
 * choose the DCT/IDCT, and c->dct_bits (which must already be set by the
 * caller) to choose between the _16 and _32 variants of the high bit depth
 * functions.
 *
 * @param c     context to fill
 * @param avctx codec context supplying the selection parameters
 */
2727 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2728 {
2729     int i;
2730
    /* the return value is ignored; the call only serves to log a warning */
2731     ff_check_alignment();
2732
    /* forward DCT: 10-bit input gets the islow_10 variants, otherwise the
     * choice follows avctx->dct_algo */
2733 #if CONFIG_ENCODERS
2734     if (avctx->bits_per_raw_sample == 10) {
2735         c->fdct    = ff_jpeg_fdct_islow_10;
2736         c->fdct248 = ff_fdct248_islow_10;
2737     } else {
2738         if(avctx->dct_algo==FF_DCT_FASTINT) {
2739             c->fdct    = ff_fdct_ifast;
2740             c->fdct248 = ff_fdct_ifast248;
2741         }
2742         else if(avctx->dct_algo==FF_DCT_FAAN) {
2743             c->fdct    = ff_faandct;
2744             c->fdct248 = ff_faandct248;
2745         }
2746         else {
2747             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2748             c->fdct248 = ff_fdct248_islow_8;
2749         }
2750     }
2751 #endif //CONFIG_ENCODERS
2752
    /* inverse DCT, together with the coefficient permutation type that is
     * passed to ff_init_scantable_permutation() at the end */
2753     if (avctx->bits_per_raw_sample == 10) {
2754         c->idct_put              = ff_simple_idct_put_10;
2755         c->idct_add              = ff_simple_idct_add_10;
2756         c->idct                  = ff_simple_idct_10;
2757         c->idct_permutation_type = FF_NO_IDCT_PERM;
2758     } else {
2759         if(avctx->idct_algo==FF_IDCT_INT){
2760             c->idct_put= ff_jref_idct_put;
2761             c->idct_add= ff_jref_idct_add;
2762             c->idct    = ff_j_rev_dct;
2763             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2764         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2765                 avctx->idct_algo==FF_IDCT_VP3){
2766             c->idct_put= ff_vp3_idct_put_c;
2767             c->idct_add= ff_vp3_idct_add_c;
2768             c->idct    = ff_vp3_idct_c;
2769             c->idct_permutation_type= FF_NO_IDCT_PERM;
2770         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2771             c->idct_put= ff_wmv2_idct_put_c;
2772             c->idct_add= ff_wmv2_idct_add_c;
2773             c->idct    = ff_wmv2_idct_c;
2774             c->idct_permutation_type= FF_NO_IDCT_PERM;
2775         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2776             c->idct_put= ff_faanidct_put;
2777             c->idct_add= ff_faanidct_add;
2778             c->idct    = ff_faanidct;
2779             c->idct_permutation_type= FF_NO_IDCT_PERM;
2780         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE(review): idct_add and idct are not assigned in this
             * branch; they keep whatever value the context held before */
2781             c->idct_put= ff_ea_idct_put_c;
2782             c->idct_permutation_type= FF_NO_IDCT_PERM;
2783         }else{ //accurate/default
2784             c->idct_put = ff_simple_idct_put_8;
2785             c->idct_add = ff_simple_idct_add_8;
2786             c->idct     = ff_simple_idct_8;
2787             c->idct_permutation_type= FF_NO_IDCT_PERM;
2788         }
2789     }
2790
    /* pixel transfer, global motion compensation and block statistics */
2791     c->diff_pixels = diff_pixels_c;
2792     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2793     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2794     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2795     c->sum_abs_dctelem = sum_abs_dctelem_c;
2796     c->gmc1 = gmc1_c;
2797     c->gmc = ff_gmc_c;
2798     c->pix_sum = pix_sum_c;
2799     c->pix_norm1 = pix_norm1_c;
2800
2801     c->fill_block_tab[0] = fill_block16_c;
2802     c->fill_block_tab[1] = fill_block8_c;
2803
    /* pix_abs[size][variant]: first index 0 = 16 wide, 1 = 8 wide; second
     * index selects the plain, x2, y2 and xy2 variants */
2804     /* TODO [0] 16  [1] 8 */
2805     c->pix_abs[0][0] = pix_abs16_c;
2806     c->pix_abs[0][1] = pix_abs16_x2_c;
2807     c->pix_abs[0][2] = pix_abs16_y2_c;
2808     c->pix_abs[0][3] = pix_abs16_xy2_c;
2809     c->pix_abs[1][0] = pix_abs8_c;
2810     c->pix_abs[1][1] = pix_abs8_x2_c;
2811     c->pix_abs[1][2] = pix_abs8_y2_c;
2812     c->pix_abs[1][3] = pix_abs8_xy2_c;
2813
    /* tpel tables: the mcXY function goes to index X + 4*Y; indices 3 and 7
     * are not assigned here */
2814     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2815     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2816     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2817     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2818     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2819     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2820     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2821     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2822     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2823
2824     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2825     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2826     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2827     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2828     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2829     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2830     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2831     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2832     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2833
    /* dspfunc(PFX, IDX, NUM) fills row IDX of c->PFX_pixels_tab with the 16
     * functions PFX<NUM>_mcXY_c; mcXY goes to index X + 4*Y */
2834 #define dspfunc(PFX, IDX, NUM) \
2835     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2836     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2837     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2838     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2839     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2840     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2841     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2842     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2843     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2844     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2845     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2846     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2847     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2848     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2849     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2850     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2851
2852     dspfunc(put_qpel, 0, 16);
2853     dspfunc(put_no_rnd_qpel, 0, 16);
2854
2855     dspfunc(avg_qpel, 0, 16);
2856     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2857
2858     dspfunc(put_qpel, 1, 8);
2859     dspfunc(put_no_rnd_qpel, 1, 8);
2860
2861     dspfunc(avg_qpel, 1, 8);
2862     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2863
2864 #undef dspfunc
2865
    /* codec-specific initializers, compiled in only when the codec is enabled */
2866 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2867     ff_mlp_init(c, avctx);
2868 #endif
2869 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2870     ff_intrax8dsp_init(c,avctx);
2871 #endif
2872
2873     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2874     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2875     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2876     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2877     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2878     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2879     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2880     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2881
    /* SET_CMP_FUNC(name) stores name16_c in c->name[0] and name8x8_c in
     * c->name[1]; the macro supplies its own trailing semicolon */
2882 #define SET_CMP_FUNC(name) \
2883     c->name[0]= name ## 16_c;\
2884     c->name[1]= name ## 8x8_c;
2885
2886     SET_CMP_FUNC(hadamard8_diff)
    /* slots 4 and 5 of the comparison tables hold the intra variants */
2887     c->hadamard8_diff[4]= hadamard8_intra16_c;
2888     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2889     SET_CMP_FUNC(dct_sad)
2890     SET_CMP_FUNC(dct_max)
2891 #if CONFIG_GPL
2892     SET_CMP_FUNC(dct264_sad)
2893 #endif
2894     c->sad[0]= pix_abs16_c;
2895     c->sad[1]= pix_abs8_c;
2896     c->sse[0]= sse16_c;
2897     c->sse[1]= sse8_c;
2898     c->sse[2]= sse4_c;
2899     SET_CMP_FUNC(quant_psnr)
2900     SET_CMP_FUNC(rd)
2901     SET_CMP_FUNC(bit)
2902     c->vsad[0]= vsad16_c;
2903     c->vsad[4]= vsad_intra16_c;
2904     c->vsad[5]= vsad_intra8_c;
2905     c->vsse[0]= vsse16_c;
2906     c->vsse[4]= vsse_intra16_c;
2907     c->vsse[5]= vsse_intra8_c;
2908     c->nsse[0]= nsse16_c;
2909     c->nsse[1]= nsse8_c;
2910 #if CONFIG_DWT
2911     ff_dsputil_init_dwt(c);
2912 #endif
2913
2914     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2915
    /* byte-wise helpers and the prediction functions with "hfyu" in their names */
2916     c->add_bytes= add_bytes_c;
2917     c->diff_bytes= diff_bytes_c;
2918     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2919     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2920     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2921     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2922     c->bswap_buf= bswap_buf;
2923     c->bswap16_buf = bswap16_buf;
2924
    /* loop filters, installed only when the matching codec is configured */
2925     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2926         c->h263_h_loop_filter= h263_h_loop_filter_c;
2927         c->h263_v_loop_filter= h263_v_loop_filter_c;
2928     }
2929
2930     if (CONFIG_VP3_DECODER) {
2931         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2932         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2933         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2934     }
2935
2936     c->h261_loop_filter= h261_loop_filter_c;
2937
2938     c->try_8x8basis= try_8x8basis_c;
2939     c->add_8x8basis= add_8x8basis_c;
2940
    /* audio / float vector helpers */
2941 #if CONFIG_VORBIS_DECODER
2942     c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
2943 #endif
2944 #if CONFIG_AC3_DECODER
2945     c->ac3_downmix = ff_ac3_downmix_c;
2946 #endif
2947     c->vector_fmul = vector_fmul_c;
2948     c->vector_fmul_reverse = vector_fmul_reverse_c;
2949     c->vector_fmul_add = vector_fmul_add_c;
2950     c->vector_fmul_window = vector_fmul_window_c;
2951     c->vector_clipf = vector_clipf_c;
2952     c->scalarproduct_int16 = scalarproduct_int16_c;
2953     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2954     c->apply_window_int16 = apply_window_int16_c;
2955     c->vector_clip_int32 = vector_clip_int32_c;
2956     c->scalarproduct_float = scalarproduct_float_c;
2957     c->butterflies_float = butterflies_float_c;
2958     c->butterflies_float_interleave = butterflies_float_interleave_c;
2959     c->vector_fmul_scalar = vector_fmul_scalar_c;
2960     c->vector_fmac_scalar = vector_fmac_scalar_c;
2961
2962     c->shrink[0]= av_image_copy_plane;
2963     c->shrink[1]= ff_shrink22;
2964     c->shrink[2]= ff_shrink44;
2965     c->shrink[3]= ff_shrink88;
2966
    /* default prefetch is a no-op */
2967     c->prefetch= just_return;
2968
    /* zero the 2-tap qpel tables so the loop at the end of this function can
     * tell which entries were never filled in */
2969     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
2970     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
2971
    /* name builders for the bit depth dependent functions:
     * FUNC(f, 9) -> f_9, FUNCC(f, 9) -> f_9_c */
2972 #undef FUNC
2973 #undef FUNCC
2974 #define FUNC(f, depth) f ## _ ## depth
2975 #define FUNCC(f, depth) f ## _ ## depth ## _c
2976
    /* dspfunc1: the 4 entries (plain, x2, y2, xy2) of one pixels_tab row */
2977 #define dspfunc1(PFX, IDX, NUM, depth)\
2978     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
2979     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
2980     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
2981     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
2982
    /* dspfunc2: the 16 mcXY entries of one qpel pixels_tab row */
2983 #define dspfunc2(PFX, IDX, NUM, depth)\
2984     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
2985     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
2986     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
2987     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
2988     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
2989     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
2990     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
2991     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
2992     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
2993     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
2994     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
2995     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
2996     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
2997     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
2998     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
2999     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3000
3001
    /* BIT_DEPTH_FUNCS(depth, dct) installs every function that exists once
     * per bit depth; dct (_16 or _32) is appended to the names of the
     * functions that also depend on the coefficient width.
     * Row 3 of avg_h264_qpel_pixels_tab is not assigned. */
3002 #define BIT_DEPTH_FUNCS(depth, dct)\
3003     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
3004     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3005     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3006     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
3007     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
3008     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
3009     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
3010     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3011     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3012 \
3013     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3014     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3015     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3016     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3017     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3018     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3019 \
3020     dspfunc1(put       , 0, 16, depth);\
3021     dspfunc1(put       , 1,  8, depth);\
3022     dspfunc1(put       , 2,  4, depth);\
3023     dspfunc1(put       , 3,  2, depth);\
3024     dspfunc1(put_no_rnd, 0, 16, depth);\
3025     dspfunc1(put_no_rnd, 1,  8, depth);\
3026     dspfunc1(avg       , 0, 16, depth);\
3027     dspfunc1(avg       , 1,  8, depth);\
3028     dspfunc1(avg       , 2,  4, depth);\
3029     dspfunc1(avg       , 3,  2, depth);\
3030     dspfunc1(avg_no_rnd, 0, 16, depth);\
3031     dspfunc1(avg_no_rnd, 1,  8, depth);\
3032 \
3033     dspfunc2(put_h264_qpel, 0, 16, depth);\
3034     dspfunc2(put_h264_qpel, 1,  8, depth);\
3035     dspfunc2(put_h264_qpel, 2,  4, depth);\
3036     dspfunc2(put_h264_qpel, 3,  2, depth);\
3037     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3038     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3039     dspfunc2(avg_h264_qpel, 2,  4, depth);
3040
    /* 9 and 10 bit get their own function sets, in the _32 flavour when
     * c->dct_bits is 32; every other value falls back to 8 bit / _16 */
3041     switch (avctx->bits_per_raw_sample) {
3042     case 9:
3043         if (c->dct_bits == 32) {
3044             BIT_DEPTH_FUNCS(9, _32);
3045         } else {
3046             BIT_DEPTH_FUNCS(9, _16);
3047         }
3048         break;
3049     case 10:
3050         if (c->dct_bits == 32) {
3051             BIT_DEPTH_FUNCS(10, _32);
3052         } else {
3053             BIT_DEPTH_FUNCS(10, _16);
3054         }
3055         break;
3056     default:
3057         BIT_DEPTH_FUNCS(8, _16);
3058         break;
3059     }
3060
3061
    /* architecture-specific initializers run after the C defaults are in
     * place and receive the same context (presumably to replace entries with
     * optimized versions -- their bodies are not shown here) */
3062     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
3063     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
3064     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
3065     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
3066     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
3067     if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
3068     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
3069     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
3070
    /* every 2-tap qpel entry that is still NULL falls back to the h264 qpel
     * function at the same position.
     * NOTE(review): the tables are walked as [0][i] with i up to 63, i.e. as
     * one flat array; this assumes they hold 64 contiguous entries (4 rows
     * of 16) -- confirm against the DSPContext declaration. */
3071     for(i=0; i<64; i++){
3072         if(!c->put_2tap_qpel_pixels_tab[0][i])
3073             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3074         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3075             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3076     }
3077
    /* build the coefficient permutation for the permutation type chosen
     * above (an arch init may have changed it) */
3078     ff_init_scantable_permutation(c->idct_permutation,
3079                                   c->idct_permutation_type);
3080 }
3081
/**
 * Unprefixed entry point that forwards unchanged to ff_dsputil_init().
 *
 * @param c     context to fill
 * @param avctx codec context supplying the selection parameters
 */
3082 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3083 {
3084     ff_dsputil_init(c, avctx);
3085 }