git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "vorbis.h"
  40
/* Lookup table with MAX_NEG_CROP extra entries on each side of the 256
 * in-range values. Zero-initialized here; presumably filled in by the init
 * code, which is not part of this section of the file. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* 512-entry lookup table; users in this file index it as ff_squareTbl + 256
 * with a signed pixel difference. Zero-initialized here; presumably filled
 * with squared values elsewhere -- TODO confirm against the init code. */
uint32_t ff_squareTbl[512] = {0, };
  43
/* Instantiate the bit-depth dependent template once for each of the depths
 * 9, 10 and 8. BIT_DEPTH is intentionally left defined as 8 after the last
 * include (there is no trailing #undef). */
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
  54
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is the byte 0x01 repeated across an unsigned long; the multiplication
// replicates the given byte value into every byte position)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  58
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) visited at scan position i. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  69
/* Specific zigzag scan for the 248 IDCT. NOTE: unlike the
   specification, we interleave the fields. Entry i is the raster index
   (row * 8 + column) visited at scan position i. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  82
/* Non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer.
 * Only declared (16-byte aligned) here; it is not initialized in this
 * section of the file. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  85
/* Alternate (horizontally biased) scan order for an 8x8 block: entry i is
 * the raster index (row * 8 + column) visited at scan position i. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  96
/* Alternate (vertically biased) scan order for an 8x8 block: entry i is
 * the raster index (row * 8 + column) visited at scan position i. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 107
/* Input permutation for the simple_idct_mmx; copied verbatim into the IDCT
 * permutation when FF_SIMPLE_IDCT_PERM is selected in
 * ff_init_scantable_permutation(). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 119
/* Column permutation within each row, used for FF_SSE2_IDCT_PERM: the low
 * three bits of the index are replaced by this table's entry. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 121
 122 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 123     int i;
 124     int end;
 125
 126     st->scantable= src_scantable;
 127
 128     for(i=0; i<64; i++){
 129         int j;
 130         j = src_scantable[i];
 131         st->permutated[i] = permutation[j];
 132     }
 133
 134     end=-1;
 135     for(i=0; i<64; i++){
 136         int j;
 137         j = st->permutated[i];
 138         if(j>end) end=j;
 139         st->raster_end[i]= end;
 140     }
 141 }
 142
 143 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 144                                    int idct_permutation_type)
 145 {
 146     int i;
 147
 148     switch(idct_permutation_type){
 149     case FF_NO_IDCT_PERM:
 150         for(i=0; i<64; i++)
 151             idct_permutation[i]= i;
 152         break;
 153     case FF_LIBMPEG2_IDCT_PERM:
 154         for(i=0; i<64; i++)
 155             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 156         break;
 157     case FF_SIMPLE_IDCT_PERM:
 158         for(i=0; i<64; i++)
 159             idct_permutation[i]= simple_mmx_permutation[i];
 160         break;
 161     case FF_TRANSPOSE_IDCT_PERM:
 162         for(i=0; i<64; i++)
 163             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 164         break;
 165     case FF_PARTTRANS_IDCT_PERM:
 166         for(i=0; i<64; i++)
 167             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 168         break;
 169     case FF_SSE2_IDCT_PERM:
 170         for(i=0; i<64; i++)
 171             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 172         break;
 173     default:
 174         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 175     }
 176 }
 177
 178 static int pix_sum_c(uint8_t * pix, int line_size)
 179 {
 180     int s, i, j;
 181
 182     s = 0;
 183     for (i = 0; i < 16; i++) {
 184         for (j = 0; j < 16; j += 8) {
 185             s += pix[0];
 186             s += pix[1];
 187             s += pix[2];
 188             s += pix[3];
 189             s += pix[4];
 190             s += pix[5];
 191             s += pix[6];
 192             s += pix[7];
 193             pix += 8;
 194         }
 195         pix += line_size - 16;
 196     }
 197     return s;
 198 }
 199
 200 static int pix_norm1_c(uint8_t * pix, int line_size)
 201 {
 202     int s, i, j;
 203     uint32_t *sq = ff_squareTbl + 256;
 204
 205     s = 0;
 206     for (i = 0; i < 16; i++) {
 207         for (j = 0; j < 16; j += 8) {
 208 #if 0
 209             s += sq[pix[0]];
 210             s += sq[pix[1]];
 211             s += sq[pix[2]];
 212             s += sq[pix[3]];
 213             s += sq[pix[4]];
 214             s += sq[pix[5]];
 215             s += sq[pix[6]];
 216             s += sq[pix[7]];
 217 #else
 218 #if HAVE_FAST_64BIT
 219             register uint64_t x=*(uint64_t*)pix;
 220             s += sq[x&0xff];
 221             s += sq[(x>>8)&0xff];
 222             s += sq[(x>>16)&0xff];
 223             s += sq[(x>>24)&0xff];
 224             s += sq[(x>>32)&0xff];
 225             s += sq[(x>>40)&0xff];
 226             s += sq[(x>>48)&0xff];
 227             s += sq[(x>>56)&0xff];
 228 #else
 229             register uint32_t x=*(uint32_t*)pix;
 230             s += sq[x&0xff];
 231             s += sq[(x>>8)&0xff];
 232             s += sq[(x>>16)&0xff];
 233             s += sq[(x>>24)&0xff];
 234             x=*(uint32_t*)(pix+4);
 235             s += sq[x&0xff];
 236             s += sq[(x>>8)&0xff];
 237             s += sq[(x>>16)&0xff];
 238             s += sq[(x>>24)&0xff];
 239 #endif
 240 #endif
 241             pix += 8;
 242         }
 243         pix += line_size - 16;
 244     }
 245     return s;
 246 }
 247
 248 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 249     int i;
 250
 251     for(i=0; i+8<=w; i+=8){
 252         dst[i+0]= av_bswap32(src[i+0]);
 253         dst[i+1]= av_bswap32(src[i+1]);
 254         dst[i+2]= av_bswap32(src[i+2]);
 255         dst[i+3]= av_bswap32(src[i+3]);
 256         dst[i+4]= av_bswap32(src[i+4]);
 257         dst[i+5]= av_bswap32(src[i+5]);
 258         dst[i+6]= av_bswap32(src[i+6]);
 259         dst[i+7]= av_bswap32(src[i+7]);
 260     }
 261     for(;i<w; i++){
 262         dst[i+0]= av_bswap32(src[i+0]);
 263     }
 264 }
 265
 266 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 267 {
 268     while (len--)
 269         *dst++ = av_bswap16(*src++);
 270 }
 271
 272 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 273 {
 274     int s, i;
 275     uint32_t *sq = ff_squareTbl + 256;
 276
 277     s = 0;
 278     for (i = 0; i < h; i++) {
 279         s += sq[pix1[0] - pix2[0]];
 280         s += sq[pix1[1] - pix2[1]];
 281         s += sq[pix1[2] - pix2[2]];
 282         s += sq[pix1[3] - pix2[3]];
 283         pix1 += line_size;
 284         pix2 += line_size;
 285     }
 286     return s;
 287 }
 288
 289 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 290 {
 291     int s, i;
 292     uint32_t *sq = ff_squareTbl + 256;
 293
 294     s = 0;
 295     for (i = 0; i < h; i++) {
 296         s += sq[pix1[0] - pix2[0]];
 297         s += sq[pix1[1] - pix2[1]];
 298         s += sq[pix1[2] - pix2[2]];
 299         s += sq[pix1[3] - pix2[3]];
 300         s += sq[pix1[4] - pix2[4]];
 301         s += sq[pix1[5] - pix2[5]];
 302         s += sq[pix1[6] - pix2[6]];
 303         s += sq[pix1[7] - pix2[7]];
 304         pix1 += line_size;
 305         pix2 += line_size;
 306     }
 307     return s;
 308 }
 309
 310 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 311 {
 312     int s, i;
 313     uint32_t *sq = ff_squareTbl + 256;
 314
 315     s = 0;
 316     for (i = 0; i < h; i++) {
 317         s += sq[pix1[ 0] - pix2[ 0]];
 318         s += sq[pix1[ 1] - pix2[ 1]];
 319         s += sq[pix1[ 2] - pix2[ 2]];
 320         s += sq[pix1[ 3] - pix2[ 3]];
 321         s += sq[pix1[ 4] - pix2[ 4]];
 322         s += sq[pix1[ 5] - pix2[ 5]];
 323         s += sq[pix1[ 6] - pix2[ 6]];
 324         s += sq[pix1[ 7] - pix2[ 7]];
 325         s += sq[pix1[ 8] - pix2[ 8]];
 326         s += sq[pix1[ 9] - pix2[ 9]];
 327         s += sq[pix1[10] - pix2[10]];
 328         s += sq[pix1[11] - pix2[11]];
 329         s += sq[pix1[12] - pix2[12]];
 330         s += sq[pix1[13] - pix2[13]];
 331         s += sq[pix1[14] - pix2[14]];
 332         s += sq[pix1[15] - pix2[15]];
 333
 334         pix1 += line_size;
 335         pix2 += line_size;
 336     }
 337     return s;
 338 }
 339
 340 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 341                           const uint8_t *s2, int stride){
 342     int i;
 343
 344     /* read the pixels */
 345     for(i=0;i<8;i++) {
 346         block[0] = s1[0] - s2[0];
 347         block[1] = s1[1] - s2[1];
 348         block[2] = s1[2] - s2[2];
 349         block[3] = s1[3] - s2[3];
 350         block[4] = s1[4] - s2[4];
 351         block[5] = s1[5] - s2[5];
 352         block[6] = s1[6] - s2[6];
 353         block[7] = s1[7] - s2[7];
 354         s1 += stride;
 355         s2 += stride;
 356         block += 8;
 357     }
 358 }
 359
 360
 361 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 362                                  int line_size)
 363 {
 364     int i;
 365
 366     /* read the pixels */
 367     for(i=0;i<8;i++) {
 368         pixels[0] = av_clip_uint8(block[0]);
 369         pixels[1] = av_clip_uint8(block[1]);
 370         pixels[2] = av_clip_uint8(block[2]);
 371         pixels[3] = av_clip_uint8(block[3]);
 372         pixels[4] = av_clip_uint8(block[4]);
 373         pixels[5] = av_clip_uint8(block[5]);
 374         pixels[6] = av_clip_uint8(block[6]);
 375         pixels[7] = av_clip_uint8(block[7]);
 376
 377         pixels += line_size;
 378         block += 8;
 379     }
 380 }
 381
 382 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 383                                         uint8_t *restrict pixels,
 384                                         int line_size)
 385 {
 386     int i, j;
 387
 388     for (i = 0; i < 8; i++) {
 389         for (j = 0; j < 8; j++) {
 390             if (*block < -128)
 391                 *pixels = 0;
 392             else if (*block > 127)
 393                 *pixels = 255;
 394             else
 395                 *pixels = (uint8_t)(*block + 128);
 396             block++;
 397             pixels++;
 398         }
 399         pixels += (line_size - 8);
 400     }
 401 }
 402
 403 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 404                                  int line_size)
 405 {
 406     int i;
 407
 408     /* read the pixels */
 409     for(i=0;i<8;i++) {
 410         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 411         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 412         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 413         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 414         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 415         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 416         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 417         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 418         pixels += line_size;
 419         block += 8;
 420     }
 421 }
 422
 423 static int sum_abs_dctelem_c(DCTELEM *block)
 424 {
 425     int sum=0, i;
 426     for(i=0; i<64; i++)
 427         sum+= FFABS(block[i]);
 428     return sum;
 429 }
 430
 431 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 432 {
 433     int i;
 434
 435     for (i = 0; i < h; i++) {
 436         memset(block, value, 16);
 437         block += line_size;
 438     }
 439 }
 440
 441 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 442 {
 443     int i;
 444
 445     for (i = 0; i < h; i++) {
 446         memset(block, value, 8);
 447         block += line_size;
 448     }
 449 }
 450
 451 #define avg2(a,b) ((a+b+1)>>1)
 452 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 453
 454 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 455 {
 456     const int A=(16-x16)*(16-y16);
 457     const int B=(   x16)*(16-y16);
 458     const int C=(16-x16)*(   y16);
 459     const int D=(   x16)*(   y16);
 460     int i;
 461
 462     for(i=0; i<h; i++)
 463     {
 464         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 465         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 466         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 467         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 468         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 469         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 470         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 471         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 472         dst+= stride;
 473         src+= stride;
 474     }
 475 }
 476
 477 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 478                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 479 {
 480     int y, vx, vy;
 481     const int s= 1<<shift;
 482
 483     width--;
 484     height--;
 485
 486     for(y=0; y<h; y++){
 487         int x;
 488
 489         vx= ox;
 490         vy= oy;
 491         for(x=0; x<8; x++){ //XXX FIXME optimize
 492             int src_x, src_y, frac_x, frac_y, index;
 493
 494             src_x= vx>>16;
 495             src_y= vy>>16;
 496             frac_x= src_x&(s-1);
 497             frac_y= src_y&(s-1);
 498             src_x>>=shift;
 499             src_y>>=shift;
 500
 501             if((unsigned)src_x < width){
 502                 if((unsigned)src_y < height){
 503                     index= src_x + src_y*stride;
 504                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 505                                            + src[index       +1]*   frac_x )*(s-frac_y)
 506                                         + (  src[index+stride  ]*(s-frac_x)
 507                                            + src[index+stride+1]*   frac_x )*   frac_y
 508                                         + r)>>(shift*2);
 509                 }else{
 510                     index= src_x + av_clip(src_y, 0, height)*stride;
 511                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 512                                           + src[index       +1]*   frac_x )*s
 513                                         + r)>>(shift*2);
 514                 }
 515             }else{
 516                 if((unsigned)src_y < height){
 517                     index= av_clip(src_x, 0, width) + src_y*stride;
 518                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 519                                            + src[index+stride  ]*   frac_y )*s
 520                                         + r)>>(shift*2);
 521                 }else{
 522                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 523                     dst[y*stride + x]=    src[index         ];
 524                 }
 525             }
 526
 527             vx+= dxx;
 528             vy+= dyx;
 529         }
 530         ox += dxy;
 531         oy += dyy;
 532     }
 533 }
 534
 535 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 536     switch(width){
 537     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 538     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 539     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 540     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 541     }
 542 }
 543
 544 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 545     int i,j;
 546     for (i=0; i < height; i++) {
 547       for (j=0; j < width; j++) {
 548         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 549       }
 550       src += stride;
 551       dst += stride;
 552     }
 553 }
 554
 555 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 556     int i,j;
 557     for (i=0; i < height; i++) {
 558       for (j=0; j < width; j++) {
 559         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 560       }
 561       src += stride;
 562       dst += stride;
 563     }
 564 }
 565
 566 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 567     int i,j;
 568     for (i=0; i < height; i++) {
 569       for (j=0; j < width; j++) {
 570         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 571       }
 572       src += stride;
 573       dst += stride;
 574     }
 575 }
 576
 577 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 578     int i,j;
 579     for (i=0; i < height; i++) {
 580       for (j=0; j < width; j++) {
 581         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 582       }
 583       src += stride;
 584       dst += stride;
 585     }
 586 }
 587
 588 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 589     int i,j;
 590     for (i=0; i < height; i++) {
 591       for (j=0; j < width; j++) {
 592         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 593       }
 594       src += stride;
 595       dst += stride;
 596     }
 597 }
 598
 599 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 600     int i,j;
 601     for (i=0; i < height; i++) {
 602       for (j=0; j < width; j++) {
 603         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 604       }
 605       src += stride;
 606       dst += stride;
 607     }
 608 }
 609
 610 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 611     int i,j;
 612     for (i=0; i < height; i++) {
 613       for (j=0; j < width; j++) {
 614         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 615       }
 616       src += stride;
 617       dst += stride;
 618     }
 619 }
 620
 621 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 622     int i,j;
 623     for (i=0; i < height; i++) {
 624       for (j=0; j < width; j++) {
 625         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 626       }
 627       src += stride;
 628       dst += stride;
 629     }
 630 }
 631
 632 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 633     switch(width){
 634     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 635     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 636     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 637     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 638     }
 639 }
 640
 641 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 642     int i,j;
 643     for (i=0; i < height; i++) {
 644       for (j=0; j < width; j++) {
 645         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 646       }
 647       src += stride;
 648       dst += stride;
 649     }
 650 }
 651
 652 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 653     int i,j;
 654     for (i=0; i < height; i++) {
 655       for (j=0; j < width; j++) {
 656         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 657       }
 658       src += stride;
 659       dst += stride;
 660     }
 661 }
 662
 663 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 664     int i,j;
 665     for (i=0; i < height; i++) {
 666       for (j=0; j < width; j++) {
 667         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 668       }
 669       src += stride;
 670       dst += stride;
 671     }
 672 }
 673
 674 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 675     int i,j;
 676     for (i=0; i < height; i++) {
 677       for (j=0; j < width; j++) {
 678         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 679       }
 680       src += stride;
 681       dst += stride;
 682     }
 683 }
 684
 685 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 686     int i,j;
 687     for (i=0; i < height; i++) {
 688       for (j=0; j < width; j++) {
 689         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 690       }
 691       src += stride;
 692       dst += stride;
 693     }
 694 }
 695
 696 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 697     int i,j;
 698     for (i=0; i < height; i++) {
 699       for (j=0; j < width; j++) {
 700         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 701       }
 702       src += stride;
 703       dst += stride;
 704     }
 705 }
 706
 707 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 708     int i,j;
 709     for (i=0; i < height; i++) {
 710       for (j=0; j < width; j++) {
 711         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 712       }
 713       src += stride;
 714       dst += stride;
 715     }
 716 }
 717
 718 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 719     int i,j;
 720     for (i=0; i < height; i++) {
 721       for (j=0; j < width; j++) {
 722         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 723       }
 724       src += stride;
 725       dst += stride;
 726     }
 727 }
 728
 729 #define QPEL_MC(r, OPNAME, RND, OP) \
 730 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 731     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 732     int i;\
 733     for(i=0; i<h; i++)\
 734     {\
 735         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 736         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 737         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 738         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 739         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 740         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 741         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 742         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 743         dst+=dstStride;\
 744         src+=srcStride;\
 745     }\
 746 }\
 747 \
 748 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 749     const int w=8;\
 750     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 751     int i;\
 752     for(i=0; i<w; i++)\
 753     {\
 754         const int src0= src[0*srcStride];\
 755         const int src1= src[1*srcStride];\
 756         const int src2= src[2*srcStride];\
 757         const int src3= src[3*srcStride];\
 758         const int src4= src[4*srcStride];\
 759         const int src5= src[5*srcStride];\
 760         const int src6= src[6*srcStride];\
 761         const int src7= src[7*srcStride];\
 762         const int src8= src[8*srcStride];\
 763         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 764         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 765         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 766         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 767         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 768         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 769         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 770         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 771         dst++;\
 772         src++;\
 773     }\
 774 }\
 775 \
 776 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 777     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 778     int i;\
 779     \
 780     for(i=0; i<h; i++)\
 781     {\
 782         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 783         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 784         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 785         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 786         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 787         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 788         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 789         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 790         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 791         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 792         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 793         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 794         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 795         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 796         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 797         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 798         dst+=dstStride;\
 799         src+=srcStride;\
 800     }\
 801 }\
 802 \
 803 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 804     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 805     int i;\
 806     const int w=16;\
 807     for(i=0; i<w; i++)\
 808     {\
 809         const int src0= src[0*srcStride];\
 810         const int src1= src[1*srcStride];\
 811         const int src2= src[2*srcStride];\
 812         const int src3= src[3*srcStride];\
 813         const int src4= src[4*srcStride];\
 814         const int src5= src[5*srcStride];\
 815         const int src6= src[6*srcStride];\
 816         const int src7= src[7*srcStride];\
 817         const int src8= src[8*srcStride];\
 818         const int src9= src[9*srcStride];\
 819         const int src10= src[10*srcStride];\
 820         const int src11= src[11*srcStride];\
 821         const int src12= src[12*srcStride];\
 822         const int src13= src[13*srcStride];\
 823         const int src14= src[14*srcStride];\
 824         const int src15= src[15*srcStride];\
 825         const int src16= src[16*srcStride];\
 826         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 827         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 828         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 829         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 830         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 831         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 832         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 833         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 834         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 835         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 836         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 837         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 838         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 839         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 840         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 841         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 842         dst++;\
 843         src++;\
 844     }\
 845 }\
 846 \
 847 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 848     uint8_t half[64];\
 849     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 850     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 851 }\
 852 \
 853 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 854     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 855 }\
 856 \
 857 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 858     uint8_t half[64];\
 859     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 860     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 861 }\
 862 \
 863 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 864     uint8_t full[16*9];\
 865     uint8_t half[64];\
 866     copy_block9(full, src, 16, stride, 9);\
 867     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 868     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 869 }\
 870 \
 871 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 872     uint8_t full[16*9];\
 873     copy_block9(full, src, 16, stride, 9);\
 874     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 875 }\
 876 \
 877 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 878     uint8_t full[16*9];\
 879     uint8_t half[64];\
 880     copy_block9(full, src, 16, stride, 9);\
 881     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 882     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 883 }\
 884 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 885     uint8_t full[16*9];\
 886     uint8_t halfH[72];\
 887     uint8_t halfV[64];\
 888     uint8_t halfHV[64];\
 889     copy_block9(full, src, 16, stride, 9);\
 890     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 891     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 892     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 893     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 894 }\
 895 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 896     uint8_t full[16*9];\
 897     uint8_t halfH[72];\
 898     uint8_t halfHV[64];\
 899     copy_block9(full, src, 16, stride, 9);\
 900     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 901     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 902     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 903     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 904 }\
 905 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 906     uint8_t full[16*9];\
 907     uint8_t halfH[72];\
 908     uint8_t halfV[64];\
 909     uint8_t halfHV[64];\
 910     copy_block9(full, src, 16, stride, 9);\
 911     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 912     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 913     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 914     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 915 }\
 916 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 917     uint8_t full[16*9];\
 918     uint8_t halfH[72];\
 919     uint8_t halfHV[64];\
 920     copy_block9(full, src, 16, stride, 9);\
 921     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 922     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 923     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 924     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 925 }\
 926 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
 927     uint8_t full[16*9];\
 928     uint8_t halfH[72];\
 929     uint8_t halfV[64];\
 930     uint8_t halfHV[64];\
 931     copy_block9(full, src, 16, stride, 9);\
 932     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 933     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 934     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 935     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 936 }\
 937 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
 938     uint8_t full[16*9];\
 939     uint8_t halfH[72];\
 940     uint8_t halfHV[64];\
 941     copy_block9(full, src, 16, stride, 9);\
 942     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 943     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 944     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 945     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 946 }\
 947 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
 948     uint8_t full[16*9];\
 949     uint8_t halfH[72];\
 950     uint8_t halfV[64];\
 951     uint8_t halfHV[64];\
 952     copy_block9(full, src, 16, stride, 9);\
 953     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 954     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 955     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 956     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 957 }\
 958 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
 959     uint8_t full[16*9];\
 960     uint8_t halfH[72];\
 961     uint8_t halfHV[64];\
 962     copy_block9(full, src, 16, stride, 9);\
 963     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 964     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 965     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 966     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 967 }\
 968 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
 969     uint8_t halfH[72];\
 970     uint8_t halfHV[64];\
 971     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 972     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 973     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 974 }\
 975 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
 976     uint8_t halfH[72];\
 977     uint8_t halfHV[64];\
 978     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 979     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 980     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 981 }\
 982 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
 983     uint8_t full[16*9];\
 984     uint8_t halfH[72];\
 985     uint8_t halfV[64];\
 986     uint8_t halfHV[64];\
 987     copy_block9(full, src, 16, stride, 9);\
 988     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 989     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 990     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 991     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
 992 }\
 993 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
 994     uint8_t full[16*9];\
 995     uint8_t halfH[72];\
 996     copy_block9(full, src, 16, stride, 9);\
 997     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 998     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 999     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1000 }\
1001 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1002     uint8_t full[16*9];\
1003     uint8_t halfH[72];\
1004     uint8_t halfV[64];\
1005     uint8_t halfHV[64];\
1006     copy_block9(full, src, 16, stride, 9);\
1007     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1008     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1009     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1010     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1011 }\
1012 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1013     uint8_t full[16*9];\
1014     uint8_t halfH[72];\
1015     copy_block9(full, src, 16, stride, 9);\
1016     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1017     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1018     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1019 }\
1020 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1021     uint8_t halfH[72];\
1022     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1023     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1024 }\
1025 \
1026 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1027     uint8_t half[256];\
1028     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1029     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1030 }\
1031 \
1032 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1033     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1034 }\
1035 \
1036 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1037     uint8_t half[256];\
1038     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1039     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1040 }\
1041 \
1042 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1043     uint8_t full[24*17];\
1044     uint8_t half[256];\
1045     copy_block17(full, src, 24, stride, 17);\
1046     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1047     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1048 }\
1049 \
1050 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1051     uint8_t full[24*17];\
1052     copy_block17(full, src, 24, stride, 17);\
1053     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1054 }\
1055 \
1056 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1057     uint8_t full[24*17];\
1058     uint8_t half[256];\
1059     copy_block17(full, src, 24, stride, 17);\
1060     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1061     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1062 }\
1063 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1064     uint8_t full[24*17];\
1065     uint8_t halfH[272];\
1066     uint8_t halfV[256];\
1067     uint8_t halfHV[256];\
1068     copy_block17(full, src, 24, stride, 17);\
1069     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1070     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1071     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1072     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1073 }\
1074 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1075     uint8_t full[24*17];\
1076     uint8_t halfH[272];\
1077     uint8_t halfHV[256];\
1078     copy_block17(full, src, 24, stride, 17);\
1079     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1080     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1081     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1082     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1083 }\
1084 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1085     uint8_t full[24*17];\
1086     uint8_t halfH[272];\
1087     uint8_t halfV[256];\
1088     uint8_t halfHV[256];\
1089     copy_block17(full, src, 24, stride, 17);\
1090     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1091     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1092     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1093     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1094 }\
1095 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1096     uint8_t full[24*17];\
1097     uint8_t halfH[272];\
1098     uint8_t halfHV[256];\
1099     copy_block17(full, src, 24, stride, 17);\
1100     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1101     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1102     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1103     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1104 }\
1105 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1106     uint8_t full[24*17];\
1107     uint8_t halfH[272];\
1108     uint8_t halfV[256];\
1109     uint8_t halfHV[256];\
1110     copy_block17(full, src, 24, stride, 17);\
1111     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1112     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1113     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1114     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1115 }\
1116 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1117     uint8_t full[24*17];\
1118     uint8_t halfH[272];\
1119     uint8_t halfHV[256];\
1120     copy_block17(full, src, 24, stride, 17);\
1121     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1122     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1123     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1124     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1125 }\
1126 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1127     uint8_t full[24*17];\
1128     uint8_t halfH[272];\
1129     uint8_t halfV[256];\
1130     uint8_t halfHV[256];\
1131     copy_block17(full, src, 24, stride, 17);\
1132     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1133     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1134     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1135     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1136 }\
1137 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1138     uint8_t full[24*17];\
1139     uint8_t halfH[272];\
1140     uint8_t halfHV[256];\
1141     copy_block17(full, src, 24, stride, 17);\
1142     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1143     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1144     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1145     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1146 }\
1147 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1148     uint8_t halfH[272];\
1149     uint8_t halfHV[256];\
1150     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1152     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1153 }\
1154 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1155     uint8_t halfH[272];\
1156     uint8_t halfHV[256];\
1157     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1158     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1159     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1160 }\
1161 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1162     uint8_t full[24*17];\
1163     uint8_t halfH[272];\
1164     uint8_t halfV[256];\
1165     uint8_t halfHV[256];\
1166     copy_block17(full, src, 24, stride, 17);\
1167     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1168     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1169     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1170     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1171 }\
1172 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1173     uint8_t full[24*17];\
1174     uint8_t halfH[272];\
1175     copy_block17(full, src, 24, stride, 17);\
1176     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1177     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1178     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1179 }\
1180 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1181     uint8_t full[24*17];\
1182     uint8_t halfH[272];\
1183     uint8_t halfV[256];\
1184     uint8_t halfHV[256];\
1185     copy_block17(full, src, 24, stride, 17);\
1186     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1187     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1188     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1189     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1190 }\
1191 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1192     uint8_t full[24*17];\
1193     uint8_t halfH[272];\
1194     copy_block17(full, src, 24, stride, 17);\
1195     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1196     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1197     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1198 }\
1199 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1200     uint8_t halfH[272];\
1201     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1202     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1203 }
1204
1205 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1206 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1207 #define op_put(a, b) a = cm[((b) + 16)>>5]
1208 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1209
1210 QPEL_MC(0, put_       , _       , op_put)
1211 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1212 QPEL_MC(0, avg_       , _       , op_avg)
1213 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1214 #undef op_avg
1215 #undef op_avg_no_rnd
1216 #undef op_put
1217 #undef op_put_no_rnd
1218
/*
 * Map the mc00 names of each qpel family onto the ff_*_pixels* routines.
 * The no_rnd family reuses the plain "put" routines.
 * NOTE(review): the 16x16 no_rnd alias uses the "_8_c" suffixed name while
 * its siblings use the unsuffixed one; presumably both name the same
 * routine, confirm against dsputil.h.
 */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c

#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1225
1226 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1227     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1228     int i;
1229
1230     for(i=0; i<h; i++){
1231         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1232         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1233         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1234         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1235         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1236         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1237         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1238         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1239         dst+=dstStride;
1240         src+=srcStride;
1241     }
1242 }
1243
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points: each one forwards to the 8-bit "xy2" pixel
 * routine of matching block size and operation (put or avg), passing the
 * block height explicitly.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1258
1259 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1260     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1261     int i;
1262
1263     for(i=0; i<w; i++){
1264         const int src_1= src[ -srcStride];
1265         const int src0 = src[0          ];
1266         const int src1 = src[  srcStride];
1267         const int src2 = src[2*srcStride];
1268         const int src3 = src[3*srcStride];
1269         const int src4 = src[4*srcStride];
1270         const int src5 = src[5*srcStride];
1271         const int src6 = src[6*srcStride];
1272         const int src7 = src[7*srcStride];
1273         const int src8 = src[8*srcStride];
1274         const int src9 = src[9*srcStride];
1275         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1276         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1277         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1278         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1279         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1280         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1281         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1282         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1283         src++;
1284         dst++;
1285     }
1286 }
1287
/*
 * mspel position (1,0): horizontally filter the 8x8 block into a scratch
 * buffer, then combine it with the unshifted source through
 * put_pixels8_l2_8 (defined elsewhere; presumably an average of its two
 * inputs, confirm).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, hfilt, stride, stride, 8, 8);
}
1293
/* mspel position (2,0): the horizontal filter output is written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1297
/*
 * mspel position (3,0): like mc10, but the horizontally filtered block is
 * combined with the source shifted one sample to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, hfilt, stride, stride, 8, 8);
}
1303
/* mspel position (0,2): the vertical filter output is written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1307
/*
 * mspel position (1,2): combine the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2_8
 * (defined elsewhere).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    /* 11 rows of 8 samples: the row above the block, the 8 block rows and
     * two rows below, as needed by the vertical filter that follows */
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    /* hfilt + 8 skips the extra top row so the filter's row -1 lands on it */
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * mspel position (3,2): like mc12, but the vertical-only filter runs on
 * the source shifted one sample to the right (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    /* 11 rows of 8 samples: one row above the block, 8 block rows, two below */
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src + 1, 8, stride, 8);
    /* hfilt + 8 skips the extra top row so the filter's row -1 lands on it */
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * mspel position (2,2): horizontal filter into an 11-row scratch buffer
 * (starting one row above the block), then vertical filter into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 11];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    /* hfilt + 8 skips the extra top row so the filter's row -1 lands on it */
    wmv2_mspel8_v_lowpass(dst, hfilt + 8, stride, 8, 8);
}
1331
1332 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1333     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1334     int x;
1335     const int strength= ff_h263_loop_filter_strength[qscale];
1336
1337     for(x=0; x<8; x++){
1338         int d1, d2, ad1;
1339         int p0= src[x-2*stride];
1340         int p1= src[x-1*stride];
1341         int p2= src[x+0*stride];
1342         int p3= src[x+1*stride];
1343         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1344
1345         if     (d<-2*strength) d1= 0;
1346         else if(d<-  strength) d1=-2*strength - d;
1347         else if(d<   strength) d1= d;
1348         else if(d< 2*strength) d1= 2*strength - d;
1349         else                   d1= 0;
1350
1351         p1 += d1;
1352         p2 -= d1;
1353         if(p1&256) p1= ~(p1>>31);
1354         if(p2&256) p2= ~(p2>>31);
1355
1356         src[x-1*stride] = p1;
1357         src[x+0*stride] = p2;
1358
1359         ad1= FFABS(d1)>>1;
1360
1361         d2= av_clip((p0-p3)/4, -ad1, ad1);
1362
1363         src[x-2*stride] = p0 - d2;
1364         src[x+  stride] = p3 + d2;
1365     }
1366     }
1367 }
1368
1369 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1370     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1371     int y;
1372     const int strength= ff_h263_loop_filter_strength[qscale];
1373
1374     for(y=0; y<8; y++){
1375         int d1, d2, ad1;
1376         int p0= src[y*stride-2];
1377         int p1= src[y*stride-1];
1378         int p2= src[y*stride+0];
1379         int p3= src[y*stride+1];
1380         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1381
1382         if     (d<-2*strength) d1= 0;
1383         else if(d<-  strength) d1=-2*strength - d;
1384         else if(d<   strength) d1= d;
1385         else if(d< 2*strength) d1= 2*strength - d;
1386         else                   d1= 0;
1387
1388         p1 += d1;
1389         p2 -= d1;
1390         if(p1&256) p1= ~(p1>>31);
1391         if(p2&256) p2= ~(p2>>31);
1392
1393         src[y*stride-1] = p1;
1394         src[y*stride+0] = p2;
1395
1396         ad1= FFABS(d1)>>1;
1397
1398         d2= av_clip((p0-p3)/4, -ad1, ad1);
1399
1400         src[y*stride-2] = p0 - d2;
1401         src[y*stride+1] = p3 + d2;
1402     }
1403     }
1404 }
1405
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch array; the top and bottom rows
 * are not filtered but scaled by 4 so every scratch entry has the same gain.
 * Pass 2 filters horizontally and writes back; the leftmost and rightmost
 * columns only undo the gain of 4, interior columns divide by 16, both with
 * rounding.
 *
 * @param src    top-left sample of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int tmp[8][8];
    int row, col;

    /* pass 1: vertical, reads src only */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;

        for (col = 0; col < 8; col++) {
            if (row == 0 || row == 7)
                tmp[row][col] = 4 * line[col];
            else
                tmp[row][col] = line[col - stride] + 2 * line[col] + line[col + stride];
        }
    }

    /* pass 2: horizontal, reads the scratch array and writes src */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row * stride;

        line[0] = (tmp[row][0] + 2) >> 2;
        line[7] = (tmp[row][7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (tmp[row][col - 1] + 2 * tmp[row][col] + tmp[row][col + 1] + 8) >> 4;
    }
}
1432
/**
 * Sum of absolute differences between two 16-sample-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return sum over h rows of |pix1[x] - pix2[x]| for x in 0..15
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1460
/**
 * Sum of absolute differences between a 16-wide block and the second block
 * interpolated horizontally: each pix1 sample is compared against
 * avg2() of the pix2 sample and its right-hand neighbour, so 17 samples of
 * every pix2 row are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return accumulated absolute differences
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1488
/**
 * Sum of absolute differences between a 16-wide block and the second block
 * interpolated vertically: each pix1 sample is compared against avg2()
 * of the pix2 sample and the sample one row below it, so h + 1 rows of
 * pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return accumulated absolute differences
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1518
/**
 * Sum of absolute differences between a 16-wide block and the second block
 * interpolated in both directions: each pix1 sample is compared against
 * avg4() of the 2x2 pix2 neighbourhood whose top-left corner is at the same
 * position, so 17 samples of h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return accumulated absolute differences
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1548
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *lhs = pix1;
    const uint8_t *rhs = pix2;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(lhs[col] - rhs[col]);

        lhs += line_size;
        rhs += line_size;
    }
    return sad;
}
1568
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against avg2() of each pix2 pixel and its right-hand neighbour.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; 9 columns per row are read
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(ref[col], ref[col + 1]));

        cur += line_size;
        ref += line_size;
    }
    return sad;
}
1588
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against avg2() of each pix2 pixel and the pixel one line below it.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; rows 0..h are read
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(upper[col], lower[col]));

        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1610
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against avg4() of each 2x2 neighbourhood of pix2 (the pixel, its right
 * neighbour, and the two pixels below them).
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; 9 columns of rows 0..h are read
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg4(upper[col], upper[col + 1], lower[col], lower[col + 1]));

        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1632
1633 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1634     MpegEncContext *c = v;
1635     int score1=0;
1636     int score2=0;
1637     int x,y;
1638
1639     for(y=0; y<h; y++){
1640         for(x=0; x<16; x++){
1641             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1642         }
1643         if(y+1<h){
1644             for(x=0; x<15; x++){
1645                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1646                              - s1[x+1] + s1[x+1+stride])
1647                         -FFABS(  s2[x  ] - s2[x  +stride]
1648                              - s2[x+1] + s2[x+1+stride]);
1649             }
1650         }
1651         s1+= stride;
1652         s2+= stride;
1653     }
1654
1655     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1656     else  return score1 + FFABS(score2)*8;
1657 }
1658
1659 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1660     MpegEncContext *c = v;
1661     int score1=0;
1662     int score2=0;
1663     int x,y;
1664
1665     for(y=0; y<h; y++){
1666         for(x=0; x<8; x++){
1667             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1668         }
1669         if(y+1<h){
1670             for(x=0; x<7; x++){
1671                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1672                              - s1[x+1] + s1[x+1+stride])
1673                         -FFABS(  s2[x  ] - s2[x  +stride]
1674                              - s2[x+1] + s2[x+1+stride]);
1675             }
1676         }
1677         s1+= stride;
1678         s2+= stride;
1679     }
1680
1681     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1682     else  return score1 + FFABS(score2)*8;
1683 }
1684
1685 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1686     int i;
1687     unsigned int sum=0;
1688
1689     for(i=0; i<8*8; i++){
1690         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1691         int w= weight[i];
1692         b>>= RECON_SHIFT;
1693         assert(-512<b && b<512);
1694
1695         sum += (w*b)*(w*b)>>4;
1696     }
1697     return sum>>2;
1698 }
1699
1700 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1701     int i;
1702
1703     for(i=0; i<8*8; i++){
1704         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1705     }
1706 }
1707
1708 /**
1709  * Permute an 8x8 block.
1710  * @param block the block which will be permuted according to the given permutation vector
1711  * @param permutation the permutation vector
1712  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1713  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1714  *                  (inverse) permutated to scantable order!
1715  */
1716 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1717 {
1718     int i;
1719     DCTELEM temp[64];
1720
1721     if(last<=0) return;
1722     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1723
1724     for(i=0; i<=last; i++){
1725         const int j= scantable[i];
1726         temp[j]= block[j];
1727         block[j]=0;
1728     }
1729
1730     for(i=0; i<=last; i++){
1731         const int j= scantable[i];
1732         const int perm_j= permutation[j];
1733         block[perm_j]= temp[j];
1734     }
1735 }
1736
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
1740
1741 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1742     int i;
1743
1744     memset(cmp, 0, sizeof(void*)*6);
1745
1746     for(i=0; i<6; i++){
1747         switch(type&0xFF){
1748         case FF_CMP_SAD:
1749             cmp[i]= c->sad[i];
1750             break;
1751         case FF_CMP_SATD:
1752             cmp[i]= c->hadamard8_diff[i];
1753             break;
1754         case FF_CMP_SSE:
1755             cmp[i]= c->sse[i];
1756             break;
1757         case FF_CMP_DCT:
1758             cmp[i]= c->dct_sad[i];
1759             break;
1760         case FF_CMP_DCT264:
1761             cmp[i]= c->dct264_sad[i];
1762             break;
1763         case FF_CMP_DCTMAX:
1764             cmp[i]= c->dct_max[i];
1765             break;
1766         case FF_CMP_PSNR:
1767             cmp[i]= c->quant_psnr[i];
1768             break;
1769         case FF_CMP_BIT:
1770             cmp[i]= c->bit[i];
1771             break;
1772         case FF_CMP_RD:
1773             cmp[i]= c->rd[i];
1774             break;
1775         case FF_CMP_VSAD:
1776             cmp[i]= c->vsad[i];
1777             break;
1778         case FF_CMP_VSSE:
1779             cmp[i]= c->vsse[i];
1780             break;
1781         case FF_CMP_ZERO:
1782             cmp[i]= zero_cmp;
1783             break;
1784         case FF_CMP_NSSE:
1785             cmp[i]= c->nsse[i];
1786             break;
1787         default:
1788             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1789         }
1790     }
1791 }
1792
/**
 * Byte-wise dst[i] += src[i] (modulo 256) for i in [0, w).
 *
 * The bulk of the data is processed one native word at a time: the low 7
 * bits of every byte are added without the carry leaving the byte, and the
 * top bit of each byte is then patched in with an XOR. Remaining bytes are
 * handled one by one.
 *
 * @param dst destination, updated in place
 * @param src source bytes
 * @param w   number of bytes; values <= 0 do nothing
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* same values as the file-level pb_7f / pb_80 masks: 0x7f resp. 0x80
     * replicated into every byte of an unsigned long */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;

    /* The bound must be computed in signed arithmetic: with an unsigned
     * sizeof operand, w < sizeof(long) would wrap to a huge value and the
     * loop would run far past the end of both buffers. */
    for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
        unsigned long a, b;

        /* memcpy keeps the word access valid for any alignment */
        memcpy(&a, src + i, sizeof(a));
        memcpy(&b, dst + i, sizeof(b));
        b = ((a & low7) + (b & low7)) ^ ((a ^ b) & top1);
        memcpy(dst + i, &b, sizeof(b));
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
1803
/**
 * Byte-wise dst[i] = src1[i] - src2[i] (modulo 256) for i in [0, w).
 *
 * When HAVE_FAST_UNALIGNED is not set and src2 is not word aligned, groups
 * of 8 bytes are subtracted one byte at a time. Otherwise a native word is
 * processed per iteration: the top bit of every byte of the minuend is
 * forced on so the 7-bit subtraction cannot borrow across bytes, and the
 * real top bits are then restored with an XOR. Remaining bytes are handled
 * one by one.
 *
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes; values <= 0 do nothing
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* same values as the file-level pb_7f / pb_80 masks: 0x7f resp. 0x80
     * replicated into every byte of an unsigned long */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;
#if !HAVE_FAST_UNALIGNED
    /* uintptr_t keeps the low address bits intact even where long is
     * narrower than a pointer */
    if((uintptr_t)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* The bound must be computed in signed arithmetic: with an unsigned
     * sizeof operand, w < sizeof(long) would wrap to a huge value and the
     * loop would run far past the end of the buffers. */
    for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
        unsigned long a, b;

        /* memcpy keeps the word access valid even if src1 or dst (which
         * are not checked above) are misaligned */
        memcpy(&a, src1 + i, sizeof(a));
        memcpy(&b, src2 + i, sizeof(b));
        a = ((a | top1) - (b & low7)) ^ ((a ^ b ^ top1) & top1);
        memcpy(dst + i, &a, sizeof(a));
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1828
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous output
 * byte, src1[i], and (previous + src1[i] - previous src1 byte) & 0xFF; the
 * residual diff[i] is added to it and the sum, truncated to 8 bits, is
 * stored in dst[i].
 *
 * @param dst      output row
 * @param src1     reference row (presumably the row above -- confirm with callers)
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: previous output byte; out: last output byte written
 * @param left_top in: previous src1 byte;   out: last src1 byte consumed
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur + top - topleft) & 0xFF;

        cur     = mid_pred(cur, top, gradient) + diff[x];
        topleft = top;
        dst[x]  = cur;
    }

    *left     = cur;
    *left_top = topleft;
}
1845
/**
 * Compute residuals of a row against a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous src2 byte,
 * src1[i], and (previous src2 + src1[i] - previous src1 byte) & 0xFF;
 * dst[i] receives src2[i] minus that predictor, truncated to 8 bits.
 *
 * @param dst      output residuals
 * @param src1     reference row (presumably the row above -- confirm with callers)
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: previous src2 byte; out: last src2 byte consumed
 * @param left_top in: previous src1 byte; out: last src1 byte consumed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev    = *left;
    uint8_t topleft = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top   = src1[x];
        const int gradient  = (prev + top - topleft) & 0xFF;
        const int predicted = mid_pred(prev, top, gradient);

        topleft = top;
        prev    = src2[x];
        dst[x]  = prev - predicted;
    }

    *left     = prev;
    *left_top = topleft;
}
1863
/**
 * Running sum: each dst[i] receives the low 8 bits of the accumulator after
 * src[i] has been added to it.
 *
 * @param dst output bytes
 * @param src input bytes
 * @param w   number of bytes
 * @param acc starting value of the accumulator
 * @return the final accumulator value (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1882
/**
 * Running sum over 4-byte pixels, one independent accumulator per channel.
 * Each output byte is the low 8 bits of its channel's accumulator after the
 * matching input byte has been added.
 *
 * The byte position of each channel inside a pixel depends on
 * HAVE_BIGENDIAN: A,R,G,B at offsets 0..3 when set, B,G,R,A otherwise.
 *
 * @param dst   output pixels (4 bytes each)
 * @param src   input pixels (4 bytes each)
 * @param w     number of pixels
 * @param red   in/out accumulator of the R channel
 * @param green in/out accumulator of the G channel
 * @param blue  in/out accumulator of the B channel
 * @param alpha in/out accumulator of the A channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
#if HAVE_BIGENDIAN
    enum { POS_A = 0, POS_R = 1, POS_G = 2, POS_B = 3 };
#else
    enum { POS_B = 0, POS_G = 1, POS_R = 2, POS_A = 3 };
#endif
    const uint8_t *in  = src;
    uint8_t       *out = dst;
    int sum_r = *red;
    int sum_g = *green;
    int sum_b = *blue;
    int sum_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        sum_b += in[POS_B];
        sum_g += in[POS_G];
        sum_r += in[POS_R];
        sum_a += in[POS_A];

        out[POS_B] = sum_b;
        out[POS_G] = sum_g;
        out[POS_R] = sum_r;
        out[POS_A] = sum_a;

        in  += 4;
        out += 4;
    }

    *red   = sum_r;
    *green = sum_g;
    *blue  = sum_b;
    *alpha = sum_a;
}
1923
/*
 * Butterfly helpers.
 *
 * BUTTERFLY2 and BUTTERFLY1 are statement macros: they are wrapped in
 * do { } while (0) so that "MACRO(...);" is a single statement even under
 * an unbraced if/else, and they must be followed by a semicolon.
 */

/* o1 = i1 + i2, then o2 = i1 - i2 (o1 is stored before i1/i2 are re-read) */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/* in place: (x, y) <- (x + y, x - y); the temporaries carry a suffix so
 * that arguments named a or b are not captured */
#define BUTTERFLY1(x,y) \
do {\
    const int bfly_a_= (x);\
    const int bfly_b_= (y);\
    (x)= bfly_a_+bfly_b_;\
    (y)= bfly_a_-bfly_b_;\
} while (0)

/* |x + y| + |x - y|, without storing the butterfly outputs */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1938
/**
 * Score an 8x8 block by transforming the pixel difference src - dst with
 * three butterfly stages along every row and three along every column, and
 * summing the absolute values of the results.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between consecutive rows (both blocks)
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed values
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* row pass: the first stage also forms the pixel differences */
    for (n = 0; n < 8; n++) {
        int *row          = temp + 8*n;
        const uint8_t *sp = src + stride*n;
        const uint8_t *dp = dst + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: the last stage is folded into the absolute sum */
    for (n = 0; n < 8; n++) {
        int *col = temp + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
1983
/**
 * Score a single 8x8 block by transforming its pixels with three butterfly
 * stages along every row and three along every column, summing the absolute
 * values of the results, and finally removing the contribution of
 * |temp[0] + temp[32]| (marked "-mean" in the original code).
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed values minus the term described above
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* row pass */
    for (n = 0; n < 8; n++) {
        int *row          = temp + 8*n;
        const uint8_t *sp = src + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: the last stage is folded into the absolute sum */
    for (n = 0; n < 8; n++) {
        int *col = temp + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2031
2032 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2033     MpegEncContext * const s= (MpegEncContext *)c;
2034     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2035
2036     assert(h==8);
2037
2038     s->dsp.diff_pixels(temp, src1, src2, stride);
2039     s->dsp.fdct(temp);
2040     return s->dsp.sum_abs_dctelem(temp);
2041 }
2042
#if CONFIG_GPL
/*
 * One 8-point 1-D integer transform.
 * The user supplies SRC(x) to read input x and DST(x, v) to consume output
 * x; every SRC() is read before the first DST() is issued.
 */
#define DCT8_1D {\
    /* sums and differences of mirrored inputs */\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    /* intermediates feeding the even outputs */\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    /* intermediates feeding the odd outputs */\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block pair: take the pixel difference with dsp.diff_pixels,
 * apply DCT8_1D to every row in place, then apply it to every column while
 * summing the absolute values of the outputs.
 *
 * @param c      MpegEncContext supplying dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows (both blocks)
 * @param h      unused
 * @return sum of absolute transformed values
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int total = 0;
    int i;

    enc->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: outputs overwrite the row that was just read */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: outputs are only accumulated, never stored */
#define SRC(x) dct[x][i]
#define DST(x,v) total += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    return total;
}
#endif
2095
2096 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2097     MpegEncContext * const s= (MpegEncContext *)c;
2098     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2099     int sum=0, i;
2100
2101     assert(h==8);
2102
2103     s->dsp.diff_pixels(temp, src1, src2, stride);
2104     s->dsp.fdct(temp);
2105
2106     for(i=0; i<64; i++)
2107         sum= FFMAX(sum, FFABS(temp[i]));
2108
2109     return sum;
2110 }
2111
2112 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2113     MpegEncContext * const s= (MpegEncContext *)c;
2114     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2115     DCTELEM * const bak = temp+64;
2116     int sum=0, i;
2117
2118     assert(h==8);
2119     s->mb_intra=0;
2120
2121     s->dsp.diff_pixels(temp, src1, src2, stride);
2122
2123     memcpy(bak, temp, 64*sizeof(DCTELEM));
2124
2125     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2126     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2127     ff_simple_idct_8(temp); //FIXME
2128
2129     for(i=0; i<64; i++)
2130         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2131
2132     return sum;
2133 }
2134
2135 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2136     MpegEncContext * const s= (MpegEncContext *)c;
2137     const uint8_t *scantable= s->intra_scantable.permutated;
2138     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2139     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2140     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2141     int i, last, run, bits, level, distortion, start_i;
2142     const int esc_length= s->ac_esc_length;
2143     uint8_t * length;
2144     uint8_t * last_length;
2145
2146     assert(h==8);
2147
2148     copy_block8(lsrc1, src1, 8, stride, 8);
2149     copy_block8(lsrc2, src2, 8, stride, 8);
2150
2151     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2152
2153     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2154
2155     bits=0;
2156
2157     if (s->mb_intra) {
2158         start_i = 1;
2159         length     = s->intra_ac_vlc_length;
2160         last_length= s->intra_ac_vlc_last_length;
2161         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2162     } else {
2163         start_i = 0;
2164         length     = s->inter_ac_vlc_length;
2165         last_length= s->inter_ac_vlc_last_length;
2166     }
2167
2168     if(last>=start_i){
2169         run=0;
2170         for(i=start_i; i<last; i++){
2171             int j= scantable[i];
2172             level= temp[j];
2173
2174             if(level){
2175                 level+=64;
2176                 if((level&(~127)) == 0){
2177                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2178                 }else
2179                     bits+= esc_length;
2180                 run=0;
2181             }else
2182                 run++;
2183         }
2184         i= scantable[last];
2185
2186         level= temp[i] + 64;
2187
2188         assert(level - 64);
2189
2190         if((level&(~127)) == 0){
2191             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2192         }else
2193             bits+= esc_length;
2194
2195     }
2196
2197     if(last>=0){
2198         if(s->mb_intra)
2199             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2200         else
2201             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2202     }
2203
2204     s->dsp.idct_add(lsrc2, 8, temp);
2205
2206     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2207
2208     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2209 }
2210
2211 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2212     MpegEncContext * const s= (MpegEncContext *)c;
2213     const uint8_t *scantable= s->intra_scantable.permutated;
2214     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2215     int i, last, run, bits, level, start_i;
2216     const int esc_length= s->ac_esc_length;
2217     uint8_t * length;
2218     uint8_t * last_length;
2219
2220     assert(h==8);
2221
2222     s->dsp.diff_pixels(temp, src1, src2, stride);
2223
2224     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2225
2226     bits=0;
2227
2228     if (s->mb_intra) {
2229         start_i = 1;
2230         length     = s->intra_ac_vlc_length;
2231         last_length= s->intra_ac_vlc_last_length;
2232         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2233     } else {
2234         start_i = 0;
2235         length     = s->inter_ac_vlc_length;
2236         last_length= s->inter_ac_vlc_last_length;
2237     }
2238
2239     if(last>=start_i){
2240         run=0;
2241         for(i=start_i; i<last; i++){
2242             int j= scantable[i];
2243             level= temp[j];
2244
2245             if(level){
2246                 level+=64;
2247                 if((level&(~127)) == 0){
2248                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2249                 }else
2250                     bits+= esc_length;
2251                 run=0;
2252             }else
2253                 run++;
2254         }
2255         i= scantable[last];
2256
2257         level= temp[i] + 64;
2258
2259         assert(level - 64);
2260
2261         if((level&(~127)) == 0){
2262             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2263         }else
2264             bits+= esc_length;
2265     }
2266
2267     return bits;
2268 }
2269
/*
 * VSAD_INTRA(size) defines vsad_intra<size>_c(): the sum of absolute
 * differences between every pixel of a <size>-wide block and the pixel
 * directly below it, over rows 0..h-2. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *cur = s;                                                                                 \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *below = cur + stride;                                                                \
                                                                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(cur[col] - below[col]);                                                          \
        cur = below;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2287
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block: for every pixel in rows 0..h-2, the residual one row
 * below is subtracted from the residual at that pixel.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows (both blocks)
 * @param h      number of rows
 * @return accumulated absolute difference
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += FFABS(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2302
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))

/*
 * VSSE_INTRA(size) defines vsse_intra<size>_c(): the sum of squared
 * differences between every pixel of a <size>-wide block and the pixel
 * directly below it, over rows 0..h-2. The c and dummy arguments are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *cur = s;                                                                                 \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *below = cur + stride;                                                                \
                                                                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(cur[col] - below[col]);                                                             \
        cur = below;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2321
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block: for every pixel in rows 0..h-2, the residual one row
 * below is subtracted from the residual at that pixel and the result is
 * squared.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows (both blocks)
 * @param h      number of rows
 * @return accumulated squared difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += SQ(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2336
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array (8-bit signed)
 * @param pix2 second array (16-bit signed)
 * @param size number of elements to compare
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
    }
    return total;
}
2345
/*
 * Instantiate WRAPPER8_16_SQ for each 8x8 comparison function defined above,
 * pairing it with a "16" name.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this part of the file;
 * presumably it defines the second-named function by applying the first one
 * to the 8x8 quadrants of a 16-pixel-wide block -- confirm against the macro.
 * The dct264 variant exists only when CONFIG_GPL is set, matching the guard
 * around dct264_sad8x8_c itself.
 */
2346 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2347 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2348 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2349 #if CONFIG_GPL
2350 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2351 #endif
2352 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2353 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2354 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2355 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2356
/**
 * dst[i] = src0[i] * src1[len - 1 - i], i.e. src0 multiplied element-wise
 * by src1 read back to front.
 *
 * @param dst  output, len floats
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const int last = len - 1;
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[last - n];
}
2363
/**
 * dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output, len floats
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n = 0;

    while (n < len) {
        dst[n] = src0[n] * src1[n] + src2[n];
        n++;
    }
}
2369
/**
 * Windowed combination of two len-sample inputs into 2*len outputs.
 *
 * For k in [0, len), with lo = k and hi = 2*len - 1 - k:
 *   dst[lo] = src0[k] * win[hi] - src1[len-1-k] * win[lo]
 *   dst[hi] = src0[k] * win[lo] + src1[len-1-k] * win[hi]
 *
 * @param dst  output, 2*len floats
 * @param src0 first input, len floats, read forwards
 * @param src1 second input, len floats, read backwards
 * @param win  window, 2*len floats
 * @param len  number of samples in each input
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int lo = k;
        const int hi = 2*len - 1 - k;
        const float s0 = src0[lo];
        const float s1 = src1[len - 1 - k];
        const float wi = win[lo];
        const float wj = win[hi];

        dst[lo] = s0*wj - s1*wi;
        dst[hi] = s0*wi + s1*wj;
    }
}
2386
/**
 * In-place butterfly over two float vectors:
 * (v1[i], v2[i]) <- (v1[i] + v2[i], v1[i] - v2[i]).
 *
 * @param v1  first vector; receives the sums
 * @param v2  second vector; receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float x = v1[n];
        const float y = v2[n];

        v1[n] = x + y;
        v2[n] = x - y;
    }
}
2397
/**
 * Butterfly with interleaved output:
 * dst[2*i] = src0[i] + src1[i], dst[2*i + 1] = src0[i] - src1[i].
 *
 * @param dst  output, 2*len floats
 * @param src0 first input
 * @param src1 second input
 * @param len  number of input elements
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    float *out = dst;
    int n;

    for (n = 0; n < len; n++) {
        const float x = src0[n];
        const float y = src1[n];

        out[0] = x + y;
        out[1] = x - y;
        out += 2;
    }
}
2409
/**
 * Dot product of two float vectors, accumulated in a float in index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[i] * v2[i]; 0 when len <= 0
 */
float ff_scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n = 0;

    while (n < len) {
        acc += v1[n] * v2[n];
        n++;
    }

    return acc;
}
2420
/**
 * Clip one value using unsigned comparisons on 32-bit patterns.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when the first test fails and a with bit 31
 *                 flipped is > maxisign (unsigned compare)
 * @param maxisign threshold for the second test; the caller in this file
 *                 passes maxi with bit 31 flipped
 * @return mini, maxi, or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2429
/**
 * Clip a float vector by handing the 32-bit patterns of the samples and of
 * the two limits to clipf_c_one().
 *
 * Elements are processed in groups of 8, so the number of elements touched
 * is len rounded up to a multiple of 8.
 * NOTE(review): callers presumably guarantee that len is a multiple of 8 and
 * that *min < 0 < *max (the only call in this file does the latter) -- confirm.
 *
 * @param dst output
 * @param src input
 * @param min pointer to the lower limit
 * @param max pointer to the upper limit
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits      = *(uint32_t*)min;
    const uint32_t hi_bits      = *(uint32_t*)max;
    const uint32_t hi_bits_flip = hi_bits ^ (1U<<31);
    uint32_t *out       = (uint32_t*)dst;
    const uint32_t *in  = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_bits_flip);
    }
}
/**
 * Clip every element of a float vector to the range [min, max].
 *
 * When the bounds have opposite signs (min < 0 and max > 0) the work is
 * delegated to vector_clipf_c_opposite_sign(); otherwise av_clipf() is
 * applied element by element. Elements are processed in groups of eight,
 * so both buffers must hold len rounded up to a multiple of 8 elements.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2465
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements to multiply-accumulate
 * @return sum of v1[i] * v2[i] over the first order elements
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int sum = 0;

    for (; order != 0; order--) {
        sum += v1[0] * v2[0];
        v1++;
        v2++;
    }

    return sum;
}
2475
/**
 * Compute the dot product of v1 and v2 while updating v1 in place.
 *
 * For each element the product v1[i] * v2[i] is accumulated using the value
 * of v1[i] from before the update, then v1[i] is incremented by
 * mul * v3[i] (stored back as int16_t).
 *
 * @param v1    first vector, modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return sum of (old v1[i]) * v2[i] over the first order elements
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int sum = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        sum   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }

    return sum;
}
2485
/**
 * Apply a symmetric window to an int16_t signal.
 *
 * Only the first len / 2 window coefficients are read; coefficient i scales
 * both input[i] and its mirror input[len - 1 - i]. Each product is rounded
 * by adding 1 << 14 and shifted right by 15. When len is odd the middle
 * output element is not written.
 *
 * @param output destination, len elements
 * @param input  source, len elements
 * @param window first half of the window, len / 2 coefficients
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int lo;

    for (lo = 0; lo < half; lo++) {
        const unsigned int hi = len - lo - 1;
        const int16_t      w  = window[lo];

        output[lo] = (MUL16(input[lo], w) + (1 << 14)) >> 15;
        output[hi] = (MUL16(input[hi], w) + (1 << 14)) >> 15;
    }
}
2498
/**
 * Clip every element of an int32_t vector to the range [min, max].
 *
 * Elements are processed in groups of eight, so both buffers must hold len
 * rounded up to a multiple of 8 elements.
 *
 * The loop condition is checked before each group. The previous do/while
 * form always processed one group first and then decremented the unsigned
 * counter, so len == 0 (or any len that is not a multiple of 8) wrapped
 * around and ran far past the end of both buffers.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; nothing is done when it is 0
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i, j;

    for (i = 0; i < len; i += 8) {
        for (j = 0; j < 8; j++)
            dst[i + j] = av_clip(src[i + j], min, max);
    }
}
2514
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * WMV2 inverse DCT, row pass: transforms the 8 consecutive coefficients at
 * b in place. Results are rounded (1 << 7) and scaled down by 8 bits.
 *
 * @param b pointer to the first coefficient of the row
 */
static void wmv2_idct_row(short * b)
{
    /* cache the inputs, the row is overwritten in place below */
    const int x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
    const int x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
    const int rnd = 1 << 7;

    /* step 1: weighted sums/differences of the coefficient pairs */
    const int p1 = W1*x1 + W7*x7;
    const int p7 = W7*x1 - W1*x7;
    const int p5 = W5*x5 + W3*x3;
    const int p3 = W3*x5 - W5*x3;
    const int p2 = W2*x2 + W6*x6;
    const int p6 = W6*x2 - W2*x6;
    const int p0 = W0*x0 + W0*x4;
    const int p4 = W0*x0 - W0*x4;

    /* step 2: combine the odd terms (1,3,5,7); 181/256 ~ 1/sqrt(2) */
    const int r1 = (181*(p1 - p5 + p7 - p3) + 128) >> 8;
    const int r2 = (181*(p1 - p5 - p7 + p3) + 128) >> 8;

    /* step 3: final butterflies, outputs k and 7-k mirror each other */
    b[0] = (p0 + p2 + p1 + p5 + rnd) >> 8;
    b[1] = (p4 + p6 + r1      + rnd) >> 8;
    b[2] = (p4 - p6 + r2      + rnd) >> 8;
    b[3] = (p0 - p2 + p7 + p3 + rnd) >> 8;
    b[4] = (p0 - p2 - p7 - p3 + rnd) >> 8;
    b[5] = (p4 - p6 - r2      + rnd) >> 8;
    b[6] = (p4 + p6 - r1      + rnd) >> 8;
    b[7] = (p0 + p2 - p1 - p5 + rnd) >> 8;
}
2550 static void wmv2_idct_col(short * b)
2551 {
2552     int s1,s2;
2553     int a0,a1,a2,a3,a4,a5,a6,a7;
2554     /*step 1, with extended precision*/
2555     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2556     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2557     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2558     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2559     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2560     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2561     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2562     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2563     /*step 2*/
2564     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2565     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2566     /*step 3*/
2567     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2568     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2569     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2570     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2571
2572     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2573     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2574     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2575     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2576 }
/**
 * In-place WMV2 inverse DCT of an 8x8 block stored row-major in 64 shorts:
 * all eight rows are transformed first, then all eight columns.
 *
 * @param block the 64 coefficients, replaced by the transform output
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2587 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2588  converted */
2589 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2590 {
2591     ff_wmv2_idct_c(block);
2592     put_pixels_clamped_c(block, dest, line_size);
2593 }
2594 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2595 {
2596     ff_wmv2_idct_c(block);
2597     add_pixels_clamped_c(block, dest, line_size);
2598 }
2599 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2600 {
2601     ff_j_rev_dct (block);
2602     put_pixels_clamped_c(block, dest, line_size);
2603 }
2604 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2605 {
2606     ff_j_rev_dct (block);
2607     add_pixels_clamped_c(block, dest, line_size);
2608 }
2609
2610 /* init static data */
2611 av_cold void ff_dsputil_static_init(void)
2612 {
2613     int i;
2614
2615     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2616     for(i=0;i<MAX_NEG_CROP;i++) {
2617         ff_cropTbl[i] = 0;
2618         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2619     }
2620
2621     for(i=0;i<512;i++) {
2622         ff_squareTbl[i] = (i - 256) * (i - 256);
2623     }
2624
2625     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2626 }
2627
2628 int ff_check_alignment(void){
2629     static int did_fail=0;
2630     LOCAL_ALIGNED_16(int, aligned, [4]);
2631
2632     if((intptr_t)aligned & 15){
2633         if(!did_fail){
2634 #if HAVE_MMX || HAVE_ALTIVEC
2635             av_log(NULL, AV_LOG_ERROR,
2636                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2637                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2638                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2639                 "Do not report crashes to Libav developers.\n");
2640 #endif
2641             did_fail=1;
2642         }
2643         return -1;
2644     }
2645     return 0;
2646 }
2647
2648 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2649 {
2650     int i, j;
2651
2652     ff_check_alignment();
2653
2654 #if CONFIG_ENCODERS
2655     if (avctx->bits_per_raw_sample == 10) {
2656         c->fdct    = ff_jpeg_fdct_islow_10;
2657         c->fdct248 = ff_fdct248_islow_10;
2658     } else {
2659         if(avctx->dct_algo==FF_DCT_FASTINT) {
2660             c->fdct    = ff_fdct_ifast;
2661             c->fdct248 = ff_fdct_ifast248;
2662         }
2663         else if(avctx->dct_algo==FF_DCT_FAAN) {
2664             c->fdct    = ff_faandct;
2665             c->fdct248 = ff_faandct248;
2666         }
2667         else {
2668             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2669             c->fdct248 = ff_fdct248_islow_8;
2670         }
2671     }
2672 #endif //CONFIG_ENCODERS
2673
2674     if (avctx->bits_per_raw_sample == 10) {
2675         c->idct_put              = ff_simple_idct_put_10;
2676         c->idct_add              = ff_simple_idct_add_10;
2677         c->idct                  = ff_simple_idct_10;
2678         c->idct_permutation_type = FF_NO_IDCT_PERM;
2679     } else {
2680         if(avctx->idct_algo==FF_IDCT_INT){
2681             c->idct_put= ff_jref_idct_put;
2682             c->idct_add= ff_jref_idct_add;
2683             c->idct    = ff_j_rev_dct;
2684             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2685         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2686             c->idct_put= ff_wmv2_idct_put_c;
2687             c->idct_add= ff_wmv2_idct_add_c;
2688             c->idct    = ff_wmv2_idct_c;
2689             c->idct_permutation_type= FF_NO_IDCT_PERM;
2690         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2691             c->idct_put= ff_faanidct_put;
2692             c->idct_add= ff_faanidct_add;
2693             c->idct    = ff_faanidct;
2694             c->idct_permutation_type= FF_NO_IDCT_PERM;
2695         }else{ //accurate/default
2696             c->idct_put = ff_simple_idct_put_8;
2697             c->idct_add = ff_simple_idct_add_8;
2698             c->idct     = ff_simple_idct_8;
2699             c->idct_permutation_type= FF_NO_IDCT_PERM;
2700         }
2701     }
2702
2703     c->diff_pixels = diff_pixels_c;
2704     c->put_pixels_clamped = put_pixels_clamped_c;
2705     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2706     c->add_pixels_clamped = add_pixels_clamped_c;
2707     c->sum_abs_dctelem = sum_abs_dctelem_c;
2708     c->gmc1 = gmc1_c;
2709     c->gmc = ff_gmc_c;
2710     c->pix_sum = pix_sum_c;
2711     c->pix_norm1 = pix_norm1_c;
2712
2713     c->fill_block_tab[0] = fill_block16_c;
2714     c->fill_block_tab[1] = fill_block8_c;
2715
2716     /* TODO [0] 16  [1] 8 */
2717     c->pix_abs[0][0] = pix_abs16_c;
2718     c->pix_abs[0][1] = pix_abs16_x2_c;
2719     c->pix_abs[0][2] = pix_abs16_y2_c;
2720     c->pix_abs[0][3] = pix_abs16_xy2_c;
2721     c->pix_abs[1][0] = pix_abs8_c;
2722     c->pix_abs[1][1] = pix_abs8_x2_c;
2723     c->pix_abs[1][2] = pix_abs8_y2_c;
2724     c->pix_abs[1][3] = pix_abs8_xy2_c;
2725
2726     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2727     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2728     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2729     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2730     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2731     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2732     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2733     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2734     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2735
2736     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2737     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2738     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2739     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2740     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2741     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2742     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2743     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2744     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2745
2746 #define dspfunc(PFX, IDX, NUM) \
2747     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2748     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2749     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2750     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2751     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2752     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2753     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2754     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2755     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2756     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2757     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2758     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2759     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2760     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2761     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2762     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2763
2764     dspfunc(put_qpel, 0, 16);
2765     dspfunc(put_no_rnd_qpel, 0, 16);
2766
2767     dspfunc(avg_qpel, 0, 16);
2768     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2769
2770     dspfunc(put_qpel, 1, 8);
2771     dspfunc(put_no_rnd_qpel, 1, 8);
2772
2773     dspfunc(avg_qpel, 1, 8);
2774     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2775
2776 #undef dspfunc
2777
2778     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2779     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2780     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2781     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2782     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2783     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2784     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2785     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2786
2787 #define SET_CMP_FUNC(name) \
2788     c->name[0]= name ## 16_c;\
2789     c->name[1]= name ## 8x8_c;
2790
2791     SET_CMP_FUNC(hadamard8_diff)
2792     c->hadamard8_diff[4]= hadamard8_intra16_c;
2793     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2794     SET_CMP_FUNC(dct_sad)
2795     SET_CMP_FUNC(dct_max)
2796 #if CONFIG_GPL
2797     SET_CMP_FUNC(dct264_sad)
2798 #endif
2799     c->sad[0]= pix_abs16_c;
2800     c->sad[1]= pix_abs8_c;
2801     c->sse[0]= sse16_c;
2802     c->sse[1]= sse8_c;
2803     c->sse[2]= sse4_c;
2804     SET_CMP_FUNC(quant_psnr)
2805     SET_CMP_FUNC(rd)
2806     SET_CMP_FUNC(bit)
2807     c->vsad[0]= vsad16_c;
2808     c->vsad[4]= vsad_intra16_c;
2809     c->vsad[5]= vsad_intra8_c;
2810     c->vsse[0]= vsse16_c;
2811     c->vsse[4]= vsse_intra16_c;
2812     c->vsse[5]= vsse_intra8_c;
2813     c->nsse[0]= nsse16_c;
2814     c->nsse[1]= nsse8_c;
2815
2816     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2817
2818     c->add_bytes= add_bytes_c;
2819     c->diff_bytes= diff_bytes_c;
2820     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2821     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2822     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2823     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2824     c->bswap_buf= bswap_buf;
2825     c->bswap16_buf = bswap16_buf;
2826
2827     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2828         c->h263_h_loop_filter= h263_h_loop_filter_c;
2829         c->h263_v_loop_filter= h263_v_loop_filter_c;
2830     }
2831
2832     c->h261_loop_filter= h261_loop_filter_c;
2833
2834     c->try_8x8basis= try_8x8basis_c;
2835     c->add_8x8basis= add_8x8basis_c;
2836
2837 #if CONFIG_VORBIS_DECODER
2838     c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
2839 #endif
2840     c->vector_fmul_reverse = vector_fmul_reverse_c;
2841     c->vector_fmul_add = vector_fmul_add_c;
2842     c->vector_fmul_window = vector_fmul_window_c;
2843     c->vector_clipf = vector_clipf_c;
2844     c->scalarproduct_int16 = scalarproduct_int16_c;
2845     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2846     c->apply_window_int16 = apply_window_int16_c;
2847     c->vector_clip_int32 = vector_clip_int32_c;
2848     c->scalarproduct_float = ff_scalarproduct_float_c;
2849     c->butterflies_float = butterflies_float_c;
2850     c->butterflies_float_interleave = butterflies_float_interleave_c;
2851
2852     c->shrink[0]= av_image_copy_plane;
2853     c->shrink[1]= ff_shrink22;
2854     c->shrink[2]= ff_shrink44;
2855     c->shrink[3]= ff_shrink88;
2856
2857     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
2858     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
2859
2860 #undef FUNC
2861 #undef FUNCC
2862 #define FUNC(f, depth) f ## _ ## depth
2863 #define FUNCC(f, depth) f ## _ ## depth ## _c
2864
2865 #define dspfunc1(PFX, IDX, NUM, depth)\
2866     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
2867     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
2868     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
2869     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
2870
2871 #define dspfunc2(PFX, IDX, NUM, depth)\
2872     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
2873     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
2874     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
2875     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
2876     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
2877     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
2878     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
2879     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
2880     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
2881     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
2882     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
2883     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
2884     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
2885     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
2886     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
2887     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
2888
2889
2890 #define BIT_DEPTH_FUNCS(depth, dct)\
2891     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
2892     c->draw_edges                    = FUNCC(draw_edges            , depth);\
2893     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
2894     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
2895     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
2896     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
2897     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
2898     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
2899 \
2900     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
2901     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
2902     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
2903     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
2904     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
2905     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
2906 \
2907     dspfunc1(put       , 0, 16, depth);\
2908     dspfunc1(put       , 1,  8, depth);\
2909     dspfunc1(put       , 2,  4, depth);\
2910     dspfunc1(put       , 3,  2, depth);\
2911     dspfunc1(put_no_rnd, 0, 16, depth);\
2912     dspfunc1(put_no_rnd, 1,  8, depth);\
2913     dspfunc1(avg       , 0, 16, depth);\
2914     dspfunc1(avg       , 1,  8, depth);\
2915     dspfunc1(avg       , 2,  4, depth);\
2916     dspfunc1(avg       , 3,  2, depth);\
2917     dspfunc1(avg_no_rnd, 0, 16, depth);\
2918     dspfunc1(avg_no_rnd, 1,  8, depth);\
2919 \
2920     dspfunc2(put_h264_qpel, 0, 16, depth);\
2921     dspfunc2(put_h264_qpel, 1,  8, depth);\
2922     dspfunc2(put_h264_qpel, 2,  4, depth);\
2923     dspfunc2(put_h264_qpel, 3,  2, depth);\
2924     dspfunc2(avg_h264_qpel, 0, 16, depth);\
2925     dspfunc2(avg_h264_qpel, 1,  8, depth);\
2926     dspfunc2(avg_h264_qpel, 2,  4, depth);
2927
2928     switch (avctx->bits_per_raw_sample) {
2929     case 9:
2930         if (c->dct_bits == 32) {
2931             BIT_DEPTH_FUNCS(9, _32);
2932         } else {
2933             BIT_DEPTH_FUNCS(9, _16);
2934         }
2935         break;
2936     case 10:
2937         if (c->dct_bits == 32) {
2938             BIT_DEPTH_FUNCS(10, _32);
2939         } else {
2940             BIT_DEPTH_FUNCS(10, _16);
2941         }
2942         break;
2943     default:
2944         BIT_DEPTH_FUNCS(8, _16);
2945         break;
2946     }
2947
2948
2949     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2950     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2951     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2952     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2953     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2954     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2955     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2956
2957     for (i = 0; i < 4; i++) {
2958         for (j = 0; j < 16; j++) {
2959             if(!c->put_2tap_qpel_pixels_tab[i][j])
2960                 c->put_2tap_qpel_pixels_tab[i][j] =
2961                     c->put_h264_qpel_pixels_tab[i][j];
2962             if(!c->avg_2tap_qpel_pixels_tab[i][j])
2963                 c->avg_2tap_qpel_pixels_tab[i][j] =
2964                     c->avg_h264_qpel_pixels_tab[i][j];
2965         }
2966     }
2967
2968     ff_init_scantable_permutation(c->idct_permutation,
2969                                   c->idct_permutation_type);
2970 }