git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "libavutil/internal.h"
  32 #include "avcodec.h"
  33 #include "copy_block.h"
  34 #include "dct.h"
  35 #include "dsputil.h"
  36 #include "simple_idct.h"
  37 #include "faandct.h"
  38 #include "faanidct.h"
  39 #include "imgconvert.h"
  40 #include "mathops.h"
  41 #include "mpegvideo.h"
  42 #include "config.h"
  43
  44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  45 uint32_t ff_squareTbl[512] = {0, };
  46
  47 #define BIT_DEPTH 9
  48 #include "dsputil_template.c"
  49 #undef BIT_DEPTH
  50
  51 #define BIT_DEPTH 10
  52 #include "dsputil_template.c"
  53 #undef BIT_DEPTH
  54
  55 #define BIT_DEPTH 8
  56 #include "dsputil_template.c"
  57
  58 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  59 #define pb_7f (~0UL/255 * 0x7f)
  60 #define pb_80 (~0UL/255 * 0x80)
  61
  62 const uint8_t ff_zigzag_direct[64] = {
  63     0,   1,  8, 16,  9,  2,  3, 10,
  64     17, 24, 32, 25, 18, 11,  4,  5,
  65     12, 19, 26, 33, 40, 48, 41, 34,
  66     27, 20, 13,  6,  7, 14, 21, 28,
  67     35, 42, 49, 56, 57, 50, 43, 36,
  68     29, 22, 15, 23, 30, 37, 44, 51,
  69     58, 59, 52, 45, 38, 31, 39, 46,
  70     53, 60, 61, 54, 47, 55, 62, 63
  71 };
  72
  73 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  74    specification, we interleave the fields */
  75 const uint8_t ff_zigzag248_direct[64] = {
  76      0,  8,  1,  9, 16, 24,  2, 10,
  77     17, 25, 32, 40, 48, 56, 33, 41,
  78     18, 26,  3, 11,  4, 12, 19, 27,
  79     34, 42, 49, 57, 50, 58, 35, 43,
  80     20, 28,  5, 13,  6, 14, 21, 29,
  81     36, 44, 51, 59, 52, 60, 37, 45,
  82     22, 30,  7, 15, 23, 31, 38, 46,
  83     53, 61, 54, 62, 39, 47, 55, 63,
  84 };
  85
/* Inverse of zigzag_direct, not permuted, with 1 added to each entry;
 * used by the MMX quantizer. Declared 16-byte aligned. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  88
  89 const uint8_t ff_alternate_horizontal_scan[64] = {
  90     0,  1,   2,  3,  8,  9, 16, 17,
  91     10, 11,  4,  5,  6,  7, 15, 14,
  92     13, 12, 19, 18, 24, 25, 32, 33,
  93     26, 27, 20, 21, 22, 23, 28, 29,
  94     30, 31, 34, 35, 40, 41, 48, 49,
  95     42, 43, 36, 37, 38, 39, 44, 45,
  96     46, 47, 50, 51, 56, 57, 58, 59,
  97     52, 53, 54, 55, 60, 61, 62, 63,
  98 };
  99
 100 const uint8_t ff_alternate_vertical_scan[64] = {
 101     0,  8,  16, 24,  1,  9,  2, 10,
 102     17, 25, 32, 40, 48, 56, 57, 49,
 103     41, 33, 26, 18,  3, 11,  4, 12,
 104     19, 27, 34, 42, 50, 58, 35, 43,
 105     51, 59, 20, 28,  5, 13,  6, 14,
 106     21, 29, 36, 44, 52, 60, 37, 45,
 107     53, 61, 22, 30,  7, 15, 23, 31,
 108     38, 46, 54, 62, 39, 47, 55, 63,
 109 };
 110
 111 /* Input permutation for the simple_idct_mmx */
 112 static const uint8_t simple_mmx_permutation[64]={
 113         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 114         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 115         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 116         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 117         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 118         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 119         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 120         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 121 };
 122
 123 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 124
 125 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 126     int i;
 127     int end;
 128
 129     st->scantable= src_scantable;
 130
 131     for(i=0; i<64; i++){
 132         int j;
 133         j = src_scantable[i];
 134         st->permutated[i] = permutation[j];
 135     }
 136
 137     end=-1;
 138     for(i=0; i<64; i++){
 139         int j;
 140         j = st->permutated[i];
 141         if(j>end) end=j;
 142         st->raster_end[i]= end;
 143     }
 144 }
 145
 146 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 147                                    int idct_permutation_type)
 148 {
 149     int i;
 150
 151     switch(idct_permutation_type){
 152     case FF_NO_IDCT_PERM:
 153         for(i=0; i<64; i++)
 154             idct_permutation[i]= i;
 155         break;
 156     case FF_LIBMPEG2_IDCT_PERM:
 157         for(i=0; i<64; i++)
 158             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 159         break;
 160     case FF_SIMPLE_IDCT_PERM:
 161         for(i=0; i<64; i++)
 162             idct_permutation[i]= simple_mmx_permutation[i];
 163         break;
 164     case FF_TRANSPOSE_IDCT_PERM:
 165         for(i=0; i<64; i++)
 166             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 167         break;
 168     case FF_PARTTRANS_IDCT_PERM:
 169         for(i=0; i<64; i++)
 170             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 171         break;
 172     case FF_SSE2_IDCT_PERM:
 173         for(i=0; i<64; i++)
 174             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 175         break;
 176     default:
 177         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 178     }
 179 }
 180
 181 static int pix_sum_c(uint8_t * pix, int line_size)
 182 {
 183     int s, i, j;
 184
 185     s = 0;
 186     for (i = 0; i < 16; i++) {
 187         for (j = 0; j < 16; j += 8) {
 188             s += pix[0];
 189             s += pix[1];
 190             s += pix[2];
 191             s += pix[3];
 192             s += pix[4];
 193             s += pix[5];
 194             s += pix[6];
 195             s += pix[7];
 196             pix += 8;
 197         }
 198         pix += line_size - 16;
 199     }
 200     return s;
 201 }
 202
 203 static int pix_norm1_c(uint8_t * pix, int line_size)
 204 {
 205     int s, i, j;
 206     uint32_t *sq = ff_squareTbl + 256;
 207
 208     s = 0;
 209     for (i = 0; i < 16; i++) {
 210         for (j = 0; j < 16; j += 8) {
 211 #if 0
 212             s += sq[pix[0]];
 213             s += sq[pix[1]];
 214             s += sq[pix[2]];
 215             s += sq[pix[3]];
 216             s += sq[pix[4]];
 217             s += sq[pix[5]];
 218             s += sq[pix[6]];
 219             s += sq[pix[7]];
 220 #else
 221 #if HAVE_FAST_64BIT
 222             register uint64_t x=*(uint64_t*)pix;
 223             s += sq[x&0xff];
 224             s += sq[(x>>8)&0xff];
 225             s += sq[(x>>16)&0xff];
 226             s += sq[(x>>24)&0xff];
 227             s += sq[(x>>32)&0xff];
 228             s += sq[(x>>40)&0xff];
 229             s += sq[(x>>48)&0xff];
 230             s += sq[(x>>56)&0xff];
 231 #else
 232             register uint32_t x=*(uint32_t*)pix;
 233             s += sq[x&0xff];
 234             s += sq[(x>>8)&0xff];
 235             s += sq[(x>>16)&0xff];
 236             s += sq[(x>>24)&0xff];
 237             x=*(uint32_t*)(pix+4);
 238             s += sq[x&0xff];
 239             s += sq[(x>>8)&0xff];
 240             s += sq[(x>>16)&0xff];
 241             s += sq[(x>>24)&0xff];
 242 #endif
 243 #endif
 244             pix += 8;
 245         }
 246         pix += line_size - 16;
 247     }
 248     return s;
 249 }
 250
 251 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 252     int i;
 253
 254     for(i=0; i+8<=w; i+=8){
 255         dst[i+0]= av_bswap32(src[i+0]);
 256         dst[i+1]= av_bswap32(src[i+1]);
 257         dst[i+2]= av_bswap32(src[i+2]);
 258         dst[i+3]= av_bswap32(src[i+3]);
 259         dst[i+4]= av_bswap32(src[i+4]);
 260         dst[i+5]= av_bswap32(src[i+5]);
 261         dst[i+6]= av_bswap32(src[i+6]);
 262         dst[i+7]= av_bswap32(src[i+7]);
 263     }
 264     for(;i<w; i++){
 265         dst[i+0]= av_bswap32(src[i+0]);
 266     }
 267 }
 268
 269 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 270 {
 271     while (len--)
 272         *dst++ = av_bswap16(*src++);
 273 }
 274
 275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 276 {
 277     int s, i;
 278     uint32_t *sq = ff_squareTbl + 256;
 279
 280     s = 0;
 281     for (i = 0; i < h; i++) {
 282         s += sq[pix1[0] - pix2[0]];
 283         s += sq[pix1[1] - pix2[1]];
 284         s += sq[pix1[2] - pix2[2]];
 285         s += sq[pix1[3] - pix2[3]];
 286         pix1 += line_size;
 287         pix2 += line_size;
 288     }
 289     return s;
 290 }
 291
 292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 293 {
 294     int s, i;
 295     uint32_t *sq = ff_squareTbl + 256;
 296
 297     s = 0;
 298     for (i = 0; i < h; i++) {
 299         s += sq[pix1[0] - pix2[0]];
 300         s += sq[pix1[1] - pix2[1]];
 301         s += sq[pix1[2] - pix2[2]];
 302         s += sq[pix1[3] - pix2[3]];
 303         s += sq[pix1[4] - pix2[4]];
 304         s += sq[pix1[5] - pix2[5]];
 305         s += sq[pix1[6] - pix2[6]];
 306         s += sq[pix1[7] - pix2[7]];
 307         pix1 += line_size;
 308         pix2 += line_size;
 309     }
 310     return s;
 311 }
 312
 313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 314 {
 315     int s, i;
 316     uint32_t *sq = ff_squareTbl + 256;
 317
 318     s = 0;
 319     for (i = 0; i < h; i++) {
 320         s += sq[pix1[ 0] - pix2[ 0]];
 321         s += sq[pix1[ 1] - pix2[ 1]];
 322         s += sq[pix1[ 2] - pix2[ 2]];
 323         s += sq[pix1[ 3] - pix2[ 3]];
 324         s += sq[pix1[ 4] - pix2[ 4]];
 325         s += sq[pix1[ 5] - pix2[ 5]];
 326         s += sq[pix1[ 6] - pix2[ 6]];
 327         s += sq[pix1[ 7] - pix2[ 7]];
 328         s += sq[pix1[ 8] - pix2[ 8]];
 329         s += sq[pix1[ 9] - pix2[ 9]];
 330         s += sq[pix1[10] - pix2[10]];
 331         s += sq[pix1[11] - pix2[11]];
 332         s += sq[pix1[12] - pix2[12]];
 333         s += sq[pix1[13] - pix2[13]];
 334         s += sq[pix1[14] - pix2[14]];
 335         s += sq[pix1[15] - pix2[15]];
 336
 337         pix1 += line_size;
 338         pix2 += line_size;
 339     }
 340     return s;
 341 }
 342
 343 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 344                           const uint8_t *s2, int stride){
 345     int i;
 346
 347     /* read the pixels */
 348     for(i=0;i<8;i++) {
 349         block[0] = s1[0] - s2[0];
 350         block[1] = s1[1] - s2[1];
 351         block[2] = s1[2] - s2[2];
 352         block[3] = s1[3] - s2[3];
 353         block[4] = s1[4] - s2[4];
 354         block[5] = s1[5] - s2[5];
 355         block[6] = s1[6] - s2[6];
 356         block[7] = s1[7] - s2[7];
 357         s1 += stride;
 358         s2 += stride;
 359         block += 8;
 360     }
 361 }
 362
 363
 364 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 365                                  int line_size)
 366 {
 367     int i;
 368
 369     /* read the pixels */
 370     for(i=0;i<8;i++) {
 371         pixels[0] = av_clip_uint8(block[0]);
 372         pixels[1] = av_clip_uint8(block[1]);
 373         pixels[2] = av_clip_uint8(block[2]);
 374         pixels[3] = av_clip_uint8(block[3]);
 375         pixels[4] = av_clip_uint8(block[4]);
 376         pixels[5] = av_clip_uint8(block[5]);
 377         pixels[6] = av_clip_uint8(block[6]);
 378         pixels[7] = av_clip_uint8(block[7]);
 379
 380         pixels += line_size;
 381         block += 8;
 382     }
 383 }
 384
 385 static void put_signed_pixels_clamped_c(const int16_t *block,
 386                                         uint8_t *restrict pixels,
 387                                         int line_size)
 388 {
 389     int i, j;
 390
 391     for (i = 0; i < 8; i++) {
 392         for (j = 0; j < 8; j++) {
 393             if (*block < -128)
 394                 *pixels = 0;
 395             else if (*block > 127)
 396                 *pixels = 255;
 397             else
 398                 *pixels = (uint8_t)(*block + 128);
 399             block++;
 400             pixels++;
 401         }
 402         pixels += (line_size - 8);
 403     }
 404 }
 405
 406 static void add_pixels8_c(uint8_t *restrict pixels,
 407                           int16_t *block,
 408                           int line_size)
 409 {
 410     int i;
 411
 412     for(i=0;i<8;i++) {
 413         pixels[0] += block[0];
 414         pixels[1] += block[1];
 415         pixels[2] += block[2];
 416         pixels[3] += block[3];
 417         pixels[4] += block[4];
 418         pixels[5] += block[5];
 419         pixels[6] += block[6];
 420         pixels[7] += block[7];
 421         pixels += line_size;
 422         block += 8;
 423     }
 424 }
 425
 426 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 427                                  int line_size)
 428 {
 429     int i;
 430
 431     /* read the pixels */
 432     for(i=0;i<8;i++) {
 433         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 434         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 435         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 436         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 437         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 438         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 439         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 440         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 441         pixels += line_size;
 442         block += 8;
 443     }
 444 }
 445
 446 static int sum_abs_dctelem_c(int16_t *block)
 447 {
 448     int sum=0, i;
 449     for(i=0; i<64; i++)
 450         sum+= FFABS(block[i]);
 451     return sum;
 452 }
 453
 454 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 455 {
 456     int i;
 457
 458     for (i = 0; i < h; i++) {
 459         memset(block, value, 16);
 460         block += line_size;
 461     }
 462 }
 463
 464 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 465 {
 466     int i;
 467
 468     for (i = 0; i < h; i++) {
 469         memset(block, value, 8);
 470         block += line_size;
 471     }
 472 }
 473
 474 #define avg2(a,b) ((a+b+1)>>1)
 475 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 476
 477 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 478 {
 479     const int A=(16-x16)*(16-y16);
 480     const int B=(   x16)*(16-y16);
 481     const int C=(16-x16)*(   y16);
 482     const int D=(   x16)*(   y16);
 483     int i;
 484
 485     for(i=0; i<h; i++)
 486     {
 487         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 488         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 489         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 490         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 491         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 492         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 493         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 494         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 495         dst+= stride;
 496         src+= stride;
 497     }
 498 }
 499
 500 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 501                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 502 {
 503     int y, vx, vy;
 504     const int s= 1<<shift;
 505
 506     width--;
 507     height--;
 508
 509     for(y=0; y<h; y++){
 510         int x;
 511
 512         vx= ox;
 513         vy= oy;
 514         for(x=0; x<8; x++){ //XXX FIXME optimize
 515             int src_x, src_y, frac_x, frac_y, index;
 516
 517             src_x= vx>>16;
 518             src_y= vy>>16;
 519             frac_x= src_x&(s-1);
 520             frac_y= src_y&(s-1);
 521             src_x>>=shift;
 522             src_y>>=shift;
 523
 524             if((unsigned)src_x < width){
 525                 if((unsigned)src_y < height){
 526                     index= src_x + src_y*stride;
 527                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 528                                            + src[index       +1]*   frac_x )*(s-frac_y)
 529                                         + (  src[index+stride  ]*(s-frac_x)
 530                                            + src[index+stride+1]*   frac_x )*   frac_y
 531                                         + r)>>(shift*2);
 532                 }else{
 533                     index= src_x + av_clip(src_y, 0, height)*stride;
 534                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 535                                           + src[index       +1]*   frac_x )*s
 536                                         + r)>>(shift*2);
 537                 }
 538             }else{
 539                 if((unsigned)src_y < height){
 540                     index= av_clip(src_x, 0, width) + src_y*stride;
 541                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 542                                            + src[index+stride  ]*   frac_y )*s
 543                                         + r)>>(shift*2);
 544                 }else{
 545                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 546                     dst[y*stride + x]=    src[index         ];
 547                 }
 548             }
 549
 550             vx+= dxx;
 551             vy+= dyx;
 552         }
 553         ox += dxy;
 554         oy += dyy;
 555     }
 556 }
 557
 558 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 559     switch(width){
 560     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 561     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 562     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 563     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 564     }
 565 }
 566
 567 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 568     int i,j;
 569     for (i=0; i < height; i++) {
 570       for (j=0; j < width; j++) {
 571         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 572       }
 573       src += stride;
 574       dst += stride;
 575     }
 576 }
 577
 578 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 579     int i,j;
 580     for (i=0; i < height; i++) {
 581       for (j=0; j < width; j++) {
 582         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 583       }
 584       src += stride;
 585       dst += stride;
 586     }
 587 }
 588
 589 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 590     int i,j;
 591     for (i=0; i < height; i++) {
 592       for (j=0; j < width; j++) {
 593         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 594       }
 595       src += stride;
 596       dst += stride;
 597     }
 598 }
 599
 600 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 601     int i,j;
 602     for (i=0; i < height; i++) {
 603       for (j=0; j < width; j++) {
 604         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 605       }
 606       src += stride;
 607       dst += stride;
 608     }
 609 }
 610
 611 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 612     int i,j;
 613     for (i=0; i < height; i++) {
 614       for (j=0; j < width; j++) {
 615         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 616       }
 617       src += stride;
 618       dst += stride;
 619     }
 620 }
 621
 622 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 623     int i,j;
 624     for (i=0; i < height; i++) {
 625       for (j=0; j < width; j++) {
 626         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 627       }
 628       src += stride;
 629       dst += stride;
 630     }
 631 }
 632
 633 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 634     int i,j;
 635     for (i=0; i < height; i++) {
 636       for (j=0; j < width; j++) {
 637         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 638       }
 639       src += stride;
 640       dst += stride;
 641     }
 642 }
 643
 644 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 645     int i,j;
 646     for (i=0; i < height; i++) {
 647       for (j=0; j < width; j++) {
 648         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 649       }
 650       src += stride;
 651       dst += stride;
 652     }
 653 }
 654
 655 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 656     switch(width){
 657     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 658     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 659     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 660     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 661     }
 662 }
 663
 664 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 665     int i,j;
 666     for (i=0; i < height; i++) {
 667       for (j=0; j < width; j++) {
 668         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 669       }
 670       src += stride;
 671       dst += stride;
 672     }
 673 }
 674
 675 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 676     int i,j;
 677     for (i=0; i < height; i++) {
 678       for (j=0; j < width; j++) {
 679         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 680       }
 681       src += stride;
 682       dst += stride;
 683     }
 684 }
 685
 686 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 687     int i,j;
 688     for (i=0; i < height; i++) {
 689       for (j=0; j < width; j++) {
 690         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 691       }
 692       src += stride;
 693       dst += stride;
 694     }
 695 }
 696
 697 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 698     int i,j;
 699     for (i=0; i < height; i++) {
 700       for (j=0; j < width; j++) {
 701         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 702       }
 703       src += stride;
 704       dst += stride;
 705     }
 706 }
 707
 708 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 709     int i,j;
 710     for (i=0; i < height; i++) {
 711       for (j=0; j < width; j++) {
 712         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 713       }
 714       src += stride;
 715       dst += stride;
 716     }
 717 }
 718
 719 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 720     int i,j;
 721     for (i=0; i < height; i++) {
 722       for (j=0; j < width; j++) {
 723         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 724       }
 725       src += stride;
 726       dst += stride;
 727     }
 728 }
 729
 730 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 731     int i,j;
 732     for (i=0; i < height; i++) {
 733       for (j=0; j < width; j++) {
 734         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 735       }
 736       src += stride;
 737       dst += stride;
 738     }
 739 }
 740
 741 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 742     int i,j;
 743     for (i=0; i < height; i++) {
 744       for (j=0; j < width; j++) {
 745         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 746       }
 747       src += stride;
 748       dst += stride;
 749     }
 750 }
 751
 752 #define QPEL_MC(r, OPNAME, RND, OP) \
 753 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 754     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 755     int i;\
 756     for(i=0; i<h; i++)\
 757     {\
 758         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 759         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 760         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 761         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 762         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 763         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 764         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 765         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 766         dst+=dstStride;\
 767         src+=srcStride;\
 768     }\
 769 }\
 770 \
 771 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 772     const int w=8;\
 773     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 774     int i;\
 775     for(i=0; i<w; i++)\
 776     {\
 777         const int src0= src[0*srcStride];\
 778         const int src1= src[1*srcStride];\
 779         const int src2= src[2*srcStride];\
 780         const int src3= src[3*srcStride];\
 781         const int src4= src[4*srcStride];\
 782         const int src5= src[5*srcStride];\
 783         const int src6= src[6*srcStride];\
 784         const int src7= src[7*srcStride];\
 785         const int src8= src[8*srcStride];\
 786         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 787         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 788         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 789         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 790         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 791         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 792         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 793         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 794         dst++;\
 795         src++;\
 796     }\
 797 }\
 798 \
 799 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 800     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 801     int i;\
 802     \
 803     for(i=0; i<h; i++)\
 804     {\
 805         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 806         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 807         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 808         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 809         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 810         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 811         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 812         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 813         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 814         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 815         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 816         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 817         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 818         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 819         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 820         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 821         dst+=dstStride;\
 822         src+=srcStride;\
 823     }\
 824 }\
 825 \
 826 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 827     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 828     int i;\
 829     const int w=16;\
 830     for(i=0; i<w; i++)\
 831     {\
 832         const int src0= src[0*srcStride];\
 833         const int src1= src[1*srcStride];\
 834         const int src2= src[2*srcStride];\
 835         const int src3= src[3*srcStride];\
 836         const int src4= src[4*srcStride];\
 837         const int src5= src[5*srcStride];\
 838         const int src6= src[6*srcStride];\
 839         const int src7= src[7*srcStride];\
 840         const int src8= src[8*srcStride];\
 841         const int src9= src[9*srcStride];\
 842         const int src10= src[10*srcStride];\
 843         const int src11= src[11*srcStride];\
 844         const int src12= src[12*srcStride];\
 845         const int src13= src[13*srcStride];\
 846         const int src14= src[14*srcStride];\
 847         const int src15= src[15*srcStride];\
 848         const int src16= src[16*srcStride];\
 849         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 850         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 851         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 852         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 853         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 854         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 855         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 856         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 857         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 858         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 859         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 860         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 861         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 862         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 863         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 864         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 865         dst++;\
 866         src++;\
 867     }\
 868 }\
 869 \
 870 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 871 {\
 872     uint8_t half[64];\
 873     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 874     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 875 }\
 876 \
 877 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 878 {\
 879     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 880 }\
 881 \
 882 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 883 {\
 884     uint8_t half[64];\
 885     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 886     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 887 }\
 888 \
 889 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 890 {\
 891     uint8_t full[16*9];\
 892     uint8_t half[64];\
 893     copy_block9(full, src, 16, stride, 9);\
 894     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 895     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 896 }\
 897 \
 898 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 899 {\
 900     uint8_t full[16*9];\
 901     copy_block9(full, src, 16, stride, 9);\
 902     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 903 }\
 904 \
 905 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 906 {\
 907     uint8_t full[16*9];\
 908     uint8_t half[64];\
 909     copy_block9(full, src, 16, stride, 9);\
 910     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 911     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 912 }\
 913 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 914 {\
 915     uint8_t full[16*9];\
 916     uint8_t halfH[72];\
 917     uint8_t halfV[64];\
 918     uint8_t halfHV[64];\
 919     copy_block9(full, src, 16, stride, 9);\
 920     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 921     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 922     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 923     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 924 }\
 925 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 926 {\
 927     uint8_t full[16*9];\
 928     uint8_t halfH[72];\
 929     uint8_t halfHV[64];\
 930     copy_block9(full, src, 16, stride, 9);\
 931     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 932     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 933     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 934     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 935 }\
 936 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 937 {\
 938     uint8_t full[16*9];\
 939     uint8_t halfH[72];\
 940     uint8_t halfV[64];\
 941     uint8_t halfHV[64];\
 942     copy_block9(full, src, 16, stride, 9);\
 943     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 944     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 945     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 946     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 947 }\
 948 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 949 {\
 950     uint8_t full[16*9];\
 951     uint8_t halfH[72];\
 952     uint8_t halfHV[64];\
 953     copy_block9(full, src, 16, stride, 9);\
 954     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 955     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 956     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 957     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 958 }\
 959 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 960 {\
 961     uint8_t full[16*9];\
 962     uint8_t halfH[72];\
 963     uint8_t halfV[64];\
 964     uint8_t halfHV[64];\
 965     copy_block9(full, src, 16, stride, 9);\
 966     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 967     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 968     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 969     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 970 }\
 971 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 972 {\
 973     uint8_t full[16*9];\
 974     uint8_t halfH[72];\
 975     uint8_t halfHV[64];\
 976     copy_block9(full, src, 16, stride, 9);\
 977     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 978     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 979     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 980     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 981 }\
 982 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 983 {\
 984     uint8_t full[16*9];\
 985     uint8_t halfH[72];\
 986     uint8_t halfV[64];\
 987     uint8_t halfHV[64];\
 988     copy_block9(full, src, 16, stride, 9);\
 989     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 990     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 991     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 992     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 993 }\
 994 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 995 {\
 996     uint8_t full[16*9];\
 997     uint8_t halfH[72];\
 998     uint8_t halfHV[64];\
 999     copy_block9(full, src, 16, stride, 9);\
1000     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1004 }\
1005 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1006 {\
1007     uint8_t halfH[72];\
1008     uint8_t halfHV[64];\
1009     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1010     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1011     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1012 }\
1013 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1014 {\
1015     uint8_t halfH[72];\
1016     uint8_t halfHV[64];\
1017     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1018     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1019     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1020 }\
1021 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1022 {\
1023     uint8_t full[16*9];\
1024     uint8_t halfH[72];\
1025     uint8_t halfV[64];\
1026     uint8_t halfHV[64];\
1027     copy_block9(full, src, 16, stride, 9);\
1028     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1029     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1030     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1031     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1032 }\
1033 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1034 {\
1035     uint8_t full[16*9];\
1036     uint8_t halfH[72];\
1037     copy_block9(full, src, 16, stride, 9);\
1038     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1039     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1040     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1041 }\
1042 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1043 {\
1044     uint8_t full[16*9];\
1045     uint8_t halfH[72];\
1046     uint8_t halfV[64];\
1047     uint8_t halfHV[64];\
1048     copy_block9(full, src, 16, stride, 9);\
1049     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1050     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1051     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1052     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1053 }\
1054 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1055 {\
1056     uint8_t full[16*9];\
1057     uint8_t halfH[72];\
1058     copy_block9(full, src, 16, stride, 9);\
1059     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1060     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1061     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1062 }\
1063 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1064 {\
1065     uint8_t halfH[72];\
1066     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1067     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1068 }\
1069 \
1070 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1071 {\
1072     uint8_t half[256];\
1073     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1074     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1075 }\
1076 \
1077 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1078 {\
1079     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1080 }\
1081 \
1082 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1083 {\
1084     uint8_t half[256];\
1085     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1086     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1087 }\
1088 \
1089 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1090 {\
1091     uint8_t full[24*17];\
1092     uint8_t half[256];\
1093     copy_block17(full, src, 24, stride, 17);\
1094     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1095     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1096 }\
1097 \
1098 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1099 {\
1100     uint8_t full[24*17];\
1101     copy_block17(full, src, 24, stride, 17);\
1102     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1103 }\
1104 \
1105 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1106 {\
1107     uint8_t full[24*17];\
1108     uint8_t half[256];\
1109     copy_block17(full, src, 24, stride, 17);\
1110     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1111     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1112 }\
1113 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1114 {\
1115     uint8_t full[24*17];\
1116     uint8_t halfH[272];\
1117     uint8_t halfV[256];\
1118     uint8_t halfHV[256];\
1119     copy_block17(full, src, 24, stride, 17);\
1120     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1121     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1122     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1123     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1124 }\
1125 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1126 {\
1127     uint8_t full[24*17];\
1128     uint8_t halfH[272];\
1129     uint8_t halfHV[256];\
1130     copy_block17(full, src, 24, stride, 17);\
1131     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1132     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1133     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1134     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1135 }\
1136 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1137 {\
1138     uint8_t full[24*17];\
1139     uint8_t halfH[272];\
1140     uint8_t halfV[256];\
1141     uint8_t halfHV[256];\
1142     copy_block17(full, src, 24, stride, 17);\
1143     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1144     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1145     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1146     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1147 }\
1148 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1149 {\
1150     uint8_t full[24*17];\
1151     uint8_t halfH[272];\
1152     uint8_t halfHV[256];\
1153     copy_block17(full, src, 24, stride, 17);\
1154     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1155     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1156     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1157     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1158 }\
1159 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1160 {\
1161     uint8_t full[24*17];\
1162     uint8_t halfH[272];\
1163     uint8_t halfV[256];\
1164     uint8_t halfHV[256];\
1165     copy_block17(full, src, 24, stride, 17);\
1166     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1170 }\
1171 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1172 {\
1173     uint8_t full[24*17];\
1174     uint8_t halfH[272];\
1175     uint8_t halfHV[256];\
1176     copy_block17(full, src, 24, stride, 17);\
1177     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1179     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1181 }\
1182 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1183 {\
1184     uint8_t full[24*17];\
1185     uint8_t halfH[272];\
1186     uint8_t halfV[256];\
1187     uint8_t halfHV[256];\
1188     copy_block17(full, src, 24, stride, 17);\
1189     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1190     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1191     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1193 }\
1194 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1195 {\
1196     uint8_t full[24*17];\
1197     uint8_t halfH[272];\
1198     uint8_t halfHV[256];\
1199     copy_block17(full, src, 24, stride, 17);\
1200     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1202     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1204 }\
1205 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1206 {\
1207     uint8_t halfH[272];\
1208     uint8_t halfHV[256];\
1209     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1210     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1211     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1212 }\
1213 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1214 {\
1215     uint8_t halfH[272];\
1216     uint8_t halfHV[256];\
1217     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1218     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1219     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1220 }\
1221 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1222 {\
1223     uint8_t full[24*17];\
1224     uint8_t halfH[272];\
1225     uint8_t halfV[256];\
1226     uint8_t halfHV[256];\
1227     copy_block17(full, src, 24, stride, 17);\
1228     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1229     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1230     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1231     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1232 }\
1233 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1234 {\
1235     uint8_t full[24*17];\
1236     uint8_t halfH[272];\
1237     copy_block17(full, src, 24, stride, 17);\
1238     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1239     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1240     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1241 }\
1242 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1243 {\
1244     uint8_t full[24*17];\
1245     uint8_t halfH[272];\
1246     uint8_t halfV[256];\
1247     uint8_t halfHV[256];\
1248     copy_block17(full, src, 24, stride, 17);\
1249     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1251     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1253 }\
1254 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1255 {\
1256     uint8_t full[24*17];\
1257     uint8_t halfH[272];\
1258     copy_block17(full, src, 24, stride, 17);\
1259     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1261     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1262 }\
1263 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1264 {\
1265     uint8_t halfH[272];\
1266     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1267     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1268 }
1269
/*
 * Store operators handed to QPEL_MC as its last argument.  In each of them
 * `a` is the destination lvalue and `b` is the un-normalised filter sum.
 * The MPEG-4 lowpass taps used inside QPEL_MC (20, 20, -6, -6, 3, 3, -1, -1)
 * add up to 32, which is why every operator shifts right by 5.
 *
 * All four index `cm`, so a local named `cm` must be in scope wherever the
 * operator is expanded.
 *
 *   op_put        : a = cm[(b + 16) >> 5]
 *   op_put_no_rnd : same as op_put but with a bias of 15 instead of 16
 *   op_avg        : averages cm[(b + 16) >> 5] with the value already in a,
 *                   adding 1 before the final >> 1
 *   op_avg_no_rnd : bias of 15 and no +1 before the final >> 1; no active
 *                   QPEL_MC line below uses it
 */
1270 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1271 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1272 #define op_put(a, b) a = cm[((b) + 16)>>5]
1273 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1274
/*
 * Instantiate the qpel function families.  Judging by how the macro body
 * pastes its arguments, the second argument is the prefix of the generated
 * function names, the third is pasted between "put" and "mpeg4_qpel..." to
 * select the helper filters, and the fourth is the store operator.
 * NOTE(review): the first argument looks like a "no rounding" flag (1 only
 * for the no_rnd family) -- confirm against the QPEL_MC parameter list.
 * The avg_no_rnd family is deliberately left disabled.
 */
1275 QPEL_MC(0, put_       , _       , op_put)
1276 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1277 QPEL_MC(0, avg_       , _       , op_avg)
1278 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meaningful for the instantiations above. */
1279 #undef op_avg
1280 #undef op_avg_no_rnd
1281 #undef op_put
1282 #undef op_put_no_rnd
1283
/*
 * QPEL_MC generates no mc00 functions, so the mc00 names are mapped straight
 * onto existing whole-block pixel routines (presumably because the
 * integer-pel position needs no filtering -- confirm).
 * The no_rnd 8x8 alias shares its target with the rounding put variant.
 * NOTE(review): the last alias targets a name with an extra "_8" suffix,
 * unlike put_qpel16_mc00_c above -- confirm both symbols exist.
 */
1284 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
1285 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1286 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1287 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1288 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1289 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1290
1291 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1292     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1293     int i;
1294
1295     for(i=0; i<h; i++){
1296         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1297         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1298         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1299         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1300         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1301         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1302         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1303         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1304         dst+=dstStride;
1305         src+=srcStride;
1306     }
1307 }
1308
#if CONFIG_RV40_DECODER
/*
 * RV40 "mc33" entry points, built only when the RV40 decoder is enabled.
 * Each one forwards unchanged to the 8-bit xy2 pixel routine of the same
 * width and operation, using a row count equal to the block width.
 */

/** 16x16 put variant: forwards to put_pixels16_xy2_8_c with 16 rows. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** 16x16 avg variant: forwards to avg_pixels16_xy2_8_c with 16 rows. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** 8x8 put variant: forwards to put_pixels8_xy2_8_c with 8 rows. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

/** 8x8 avg variant: forwards to avg_pixels8_xy2_8_c with 8 rows. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1327
1328 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1329     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1330     int i;
1331
1332     for(i=0; i<w; i++){
1333         const int src_1= src[ -srcStride];
1334         const int src0 = src[0          ];
1335         const int src1 = src[  srcStride];
1336         const int src2 = src[2*srcStride];
1337         const int src3 = src[3*srcStride];
1338         const int src4 = src[4*srcStride];
1339         const int src5 = src[5*srcStride];
1340         const int src6 = src[6*srcStride];
1341         const int src7 = src[7*srcStride];
1342         const int src8 = src[8*srcStride];
1343         const int src9 = src[9*srcStride];
1344         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1345         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1346         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1347         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1348         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1349         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1350         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1351         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1352         src++;
1353         dst++;
1354     }
1355 }
1356
/**
 * 8x8 mspel "mc10" case: horizontally lowpass the source into a packed
 * 8x8 scratch block, then combine that block with the unshifted source via
 * put_pixels8_l2_8 (presumably a rounded average of its two inputs).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1363
/**
 * 8x8 mspel "mc20" case: the horizontally lowpassed source is written
 * directly to dst (8 rows, same stride on both sides).
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1368
/**
 * 8x8 mspel "mc30" case: like mc10, but the horizontally lowpassed block is
 * combined with the source shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1375
/**
 * 8x8 mspel "mc02" case: the vertically lowpassed source is written
 * directly to dst (8 columns, same stride on both sides).
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
1380
/**
 * 8x8 mspel "mc12" case: combine the vertically filtered source with the
 * horizontally-then-vertically filtered source.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads rows -1 .. 9 of its input; it is therefore handed
 * the second row of that scratch buffer.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rowsH[8 * 11];   /* h-lowpass of source rows -1 .. 9 */
    uint8_t vert[8 * 8];     /* v-lowpass of the plain source    */
    uint8_t both[8 * 8];     /* v-lowpass of rowsH               */

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vert, both, stride, 8, 8, 8);
}
/**
 * 8x8 mspel "mc32" case: same as mc12 except that the purely vertical pass
 * is taken from the source shifted one pixel to the right (src + 1).
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads rows -1 .. 9 of its input.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rowsH[8 * 11];   /* h-lowpass of source rows -1 .. 9 */
    uint8_t vert[8 * 8];     /* v-lowpass of src + 1             */
    uint8_t both[8 * 8];     /* v-lowpass of rowsH               */

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vert, both, stride, 8, 8, 8);
}
/**
 * 8x8 mspel "mc22" case: horizontal lowpass into a scratch buffer followed
 * by a vertical lowpass written straight to dst.
 *
 * Eleven rows are filtered horizontally, starting one row above src, so the
 * vertical pass (which reads rows -1 .. 9) can start at the second row.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rowsH[8 * 11];

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowsH + 8, stride, 8, 8);
}
1407
1408 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1409     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1410     int x;
1411     const int strength= ff_h263_loop_filter_strength[qscale];
1412
1413     for(x=0; x<8; x++){
1414         int d1, d2, ad1;
1415         int p0= src[x-2*stride];
1416         int p1= src[x-1*stride];
1417         int p2= src[x+0*stride];
1418         int p3= src[x+1*stride];
1419         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1420
1421         if     (d<-2*strength) d1= 0;
1422         else if(d<-  strength) d1=-2*strength - d;
1423         else if(d<   strength) d1= d;
1424         else if(d< 2*strength) d1= 2*strength - d;
1425         else                   d1= 0;
1426
1427         p1 += d1;
1428         p2 -= d1;
1429         if(p1&256) p1= ~(p1>>31);
1430         if(p2&256) p2= ~(p2>>31);
1431
1432         src[x-1*stride] = p1;
1433         src[x+0*stride] = p2;
1434
1435         ad1= FFABS(d1)>>1;
1436
1437         d2= av_clip((p0-p3)/4, -ad1, ad1);
1438
1439         src[x-2*stride] = p0 - d2;
1440         src[x+  stride] = p3 + d2;
1441     }
1442     }
1443 }
1444
1445 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1446     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1447     int y;
1448     const int strength= ff_h263_loop_filter_strength[qscale];
1449
1450     for(y=0; y<8; y++){
1451         int d1, d2, ad1;
1452         int p0= src[y*stride-2];
1453         int p1= src[y*stride-1];
1454         int p2= src[y*stride+0];
1455         int p3= src[y*stride+1];
1456         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1457
1458         if     (d<-2*strength) d1= 0;
1459         else if(d<-  strength) d1=-2*strength - d;
1460         else if(d<   strength) d1= d;
1461         else if(d< 2*strength) d1= 2*strength - d;
1462         else                   d1= 0;
1463
1464         p1 += d1;
1465         p2 -= d1;
1466         if(p1&256) p1= ~(p1>>31);
1467         if(p2&256) p2= ~(p2>>31);
1468
1469         src[y*stride-1] = p1;
1470         src[y*stride+0] = p2;
1471
1472         ad1= FFABS(d1)>>1;
1473
1474         d2= av_clip((p0-p3)/4, -ad1, ad1);
1475
1476         src[y*stride-2] = p0 - d2;
1477         src[y*stride+1] = p3 + d2;
1478     }
1479     }
1480 }
1481
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          the accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1509
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference pixel with its right-hand neighbour.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; 17 bytes of each row are read
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1537
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of
 * each reference pixel with the pixel directly below it.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; h + 1 rows are read
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;   /* reference row one line further down */
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1567
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of
 * each 2x2 neighbourhood (pixel, right, below, below-right) of the reference.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; h + 1 rows of 17 bytes are read
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;   /* reference row one line further down */
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1597
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive lines of both blocks
 * @param h         number of lines to process
 * @return accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1617
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * horizontally averaged reference.
 *
 * Every reference sample is avg2() of a pix2 pixel and its right neighbour
 * (avg2 is defined elsewhere in this file).
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block; 9 columns of it are read per line
 * @param line_size offset between consecutive lines of both blocks
 * @param h         number of lines to process
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1637
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * vertically averaged reference.
 *
 * Every reference sample is avg2() of a pix2 pixel and the pixel one line
 * below it (avg2 is defined elsewhere in this file).
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block; h + 1 lines of it are read
 * @param line_size offset between consecutive lines of both blocks
 * @param h         number of lines to process
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1659
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * reference averaged over 2x2 neighbourhoods.
 *
 * Every reference sample is avg4() of a pix2 pixel, its right neighbour and
 * the two pixels directly below those (avg4 is defined elsewhere in this
 * file).
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block; 9 columns and h + 1 lines are read
 * @param line_size offset between consecutive lines of both blocks
 * @param h         number of lines to process
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1681
1682 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1683     MpegEncContext *c = v;
1684     int score1=0;
1685     int score2=0;
1686     int x,y;
1687
1688     for(y=0; y<h; y++){
1689         for(x=0; x<16; x++){
1690             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1691         }
1692         if(y+1<h){
1693             for(x=0; x<15; x++){
1694                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1695                              - s1[x+1] + s1[x+1+stride])
1696                         -FFABS(  s2[x  ] - s2[x  +stride]
1697                              - s2[x+1] + s2[x+1+stride]);
1698             }
1699         }
1700         s1+= stride;
1701         s2+= stride;
1702     }
1703
1704     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1705     else  return score1 + FFABS(score2)*8;
1706 }
1707
1708 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1709     MpegEncContext *c = v;
1710     int score1=0;
1711     int score2=0;
1712     int x,y;
1713
1714     for(y=0; y<h; y++){
1715         for(x=0; x<8; x++){
1716             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1717         }
1718         if(y+1<h){
1719             for(x=0; x<7; x++){
1720                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1721                              - s1[x+1] + s1[x+1+stride])
1722                         -FFABS(  s2[x  ] - s2[x  +stride]
1723                              - s2[x+1] + s2[x+1+stride]);
1724             }
1725         }
1726         s1+= stride;
1727         s2+= stride;
1728     }
1729
1730     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1731     else  return score1 + FFABS(score2)*8;
1732 }
1733
1734 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1735     int i;
1736     unsigned int sum=0;
1737
1738     for(i=0; i<8*8; i++){
1739         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1740         int w= weight[i];
1741         b>>= RECON_SHIFT;
1742         assert(-512<b && b<512);
1743
1744         sum += (w*b)*(w*b)>>4;
1745     }
1746     return sum>>2;
1747 }
1748
1749 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1750     int i;
1751
1752     for(i=0; i<8*8; i++){
1753         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1754     }
1755 }
1756
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1760
1761 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1762     int i;
1763
1764     memset(cmp, 0, sizeof(void*)*6);
1765
1766     for(i=0; i<6; i++){
1767         switch(type&0xFF){
1768         case FF_CMP_SAD:
1769             cmp[i]= c->sad[i];
1770             break;
1771         case FF_CMP_SATD:
1772             cmp[i]= c->hadamard8_diff[i];
1773             break;
1774         case FF_CMP_SSE:
1775             cmp[i]= c->sse[i];
1776             break;
1777         case FF_CMP_DCT:
1778             cmp[i]= c->dct_sad[i];
1779             break;
1780         case FF_CMP_DCT264:
1781             cmp[i]= c->dct264_sad[i];
1782             break;
1783         case FF_CMP_DCTMAX:
1784             cmp[i]= c->dct_max[i];
1785             break;
1786         case FF_CMP_PSNR:
1787             cmp[i]= c->quant_psnr[i];
1788             break;
1789         case FF_CMP_BIT:
1790             cmp[i]= c->bit[i];
1791             break;
1792         case FF_CMP_RD:
1793             cmp[i]= c->rd[i];
1794             break;
1795         case FF_CMP_VSAD:
1796             cmp[i]= c->vsad[i];
1797             break;
1798         case FF_CMP_VSSE:
1799             cmp[i]= c->vsse[i];
1800             break;
1801         case FF_CMP_ZERO:
1802             cmp[i]= zero_cmp;
1803             break;
1804         case FF_CMP_NSSE:
1805             cmp[i]= c->nsse[i];
1806             break;
1807         default:
1808             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1809         }
1810     }
1811 }
1812
1813 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1814     long i;
1815     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1816         long a = *(long*)(src+i);
1817         long b = *(long*)(dst+i);
1818         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1819     }
1820     for(; i<w; i++)
1821         dst[i+0] += src[i+0];
1822 }
1823
1824 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1825     long i;
1826 #if !HAVE_FAST_UNALIGNED
1827     if((long)src2 & (sizeof(long)-1)){
1828         for(i=0; i+7<w; i+=8){
1829             dst[i+0] = src1[i+0]-src2[i+0];
1830             dst[i+1] = src1[i+1]-src2[i+1];
1831             dst[i+2] = src1[i+2]-src2[i+2];
1832             dst[i+3] = src1[i+3]-src2[i+3];
1833             dst[i+4] = src1[i+4]-src2[i+4];
1834             dst[i+5] = src1[i+5]-src2[i+5];
1835             dst[i+6] = src1[i+6]-src2[i+6];
1836             dst[i+7] = src1[i+7]-src2[i+7];
1837         }
1838     }else
1839 #endif
1840     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1841         long a = *(long*)(src1+i);
1842         long b = *(long*)(src2+i);
1843         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1844     }
1845     for(; i<w; i++)
1846         dst[i+0] = src1[i+0]-src2[i+0];
1847 }
1848
/**
 * Reconstruct a line from median-predicted residuals.
 *
 * For each position the predictor is mid_pred() of the previous output,
 * the src1 sample at the same position, and (previous + src1 - previous
 * src1) masked to 8 bits; diff is added to the predictor and the result is
 * stored truncated to a byte.
 *
 * @param dst      reconstructed output line
 * @param src1     reference line feeding the predictor
 * @param diff     residuals to add
 * @param w        number of samples
 * @param left     in: initial "previous output" value; out: last output
 * @param left_top in: initial "previous src1" value; out: last src1 sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int above    = src1[pos];
        const int gradient = (cur + above - up_left) & 0xFF;

        cur      = mid_pred(cur, above, gradient) + diff[pos];
        up_left  = above;
        dst[pos] = cur;
    }

    *left     = cur;
    *left_top = up_left;
}
1865
/**
 * Compute median-prediction residuals for a line.
 *
 * For each position the predictor is mid_pred() of the previous src2
 * sample, the src1 sample at the same position, and (previous + src1 -
 * previous src1) masked to 8 bits; the stored residual is the src2 sample
 * minus that predictor, truncated to a byte.
 *
 * @param dst      residual output line
 * @param src1     reference line feeding the predictor
 * @param src2     line being predicted
 * @param w        number of samples
 * @param left     in: initial "previous src2" value; out: last src2 sample
 * @param left_top in: initial "previous src1" value; out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int above     = src1[pos];
        const int gradient  = (cur + above - up_left) & 0xFF;
        const int predicted = mid_pred(cur, above, gradient);

        up_left  = above;
        cur      = src2[pos];
        dst[pos] = cur - predicted;
    }

    *left     = cur;
    *left_top = up_left;
}
1883
/**
 * Running-sum reconstruction: each output byte is the low 8 bits of the
 * accumulator after adding the corresponding source byte.
 *
 * @param dst output line
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1902
/* Byte offsets of the four components inside one 4-byte pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum reconstruction of 4-byte pixels, one accumulator per
 * component. Each output byte is the low 8 bits of its accumulator.
 *
 * @param dst   output pixels
 * @param src   residual pixels
 * @param w     number of pixels (4 bytes each)
 * @param red   in: initial red accumulator; out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator; out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* read the whole pixel before writing it */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1943
/* o1 = i1 + i2, o2 = i1 - i2. Expands to two plain statements, so it must
 * not be used as the unbraced body of an if/else/loop. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1 = (i1) + (i2);           \
    o2 = (i1) - (i2);

/* In-place version: (x, y) becomes (x + y, x - y). The expansion declares
 * locals named a and b, so the arguments must not use those names. */
#define BUTTERFLY1(x,y) \
{                       \
    int a, b;           \
    a = x;              \
    b = y;              \
    x = a + b;          \
    y = a - b;          \
}

/* |x + y| + |x - y|, without modifying x or y. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1958
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the difference src - dst.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride offset between consecutive lines of both blocks
 * @param h      must be 8 (asserted)
 * @return sum of absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int coef[64];
    int total = 0;
    int k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every line */
    for (k = 0; k < 8; k++) {
        //FIXME try pointer walks
        int     *line = coef + 8 * k;
        uint8_t *cur  = src  + stride * k;
        uint8_t *ref  = dst  + stride * k;

        BUTTERFLY2(line[0], line[1], cur[0] - ref[0], cur[1] - ref[1]);
        BUTTERFLY2(line[2], line[3], cur[2] - ref[2], cur[3] - ref[3]);
        BUTTERFLY2(line[4], line[5], cur[4] - ref[4], cur[5] - ref[5]);
        BUTTERFLY2(line[6], line[7], cur[6] - ref[6], cur[7] - ref[7]);

        BUTTERFLY1(line[0], line[2]);
        BUTTERFLY1(line[1], line[3]);
        BUTTERFLY1(line[4], line[6]);
        BUTTERFLY1(line[5], line[7]);

        BUTTERFLY1(line[0], line[4]);
        BUTTERFLY1(line[1], line[5]);
        BUTTERFLY1(line[2], line[6]);
        BUTTERFLY1(line[3], line[7]);
    }

    /* vertical pass: two in-place stages, the third is folded into the
     * absolute-value sum via BUTTERFLYA */
    for (k = 0; k < 8; k++) {
        int *column = coef + k;

        BUTTERFLY1(column[8 * 0], column[8 * 1]);
        BUTTERFLY1(column[8 * 2], column[8 * 3]);
        BUTTERFLY1(column[8 * 4], column[8 * 5]);
        BUTTERFLY1(column[8 * 6], column[8 * 7]);

        BUTTERFLY1(column[8 * 0], column[8 * 2]);
        BUTTERFLY1(column[8 * 1], column[8 * 3]);
        BUTTERFLY1(column[8 * 4], column[8 * 6]);
        BUTTERFLY1(column[8 * 5], column[8 * 7]);

        total += BUTTERFLYA(column[8 * 0], column[8 * 4])
               + BUTTERFLYA(column[8 * 1], column[8 * 5])
               + BUTTERFLYA(column[8 * 2], column[8 * 6])
               + BUTTERFLYA(column[8 * 3], column[8 * 7]);
    }
    return total;
}
2003
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of a single block, with the |coef[0] + coef[32]| term (marked "-mean" in
 * the original code) subtracted from the result.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride offset between consecutive lines of src
 * @param h      must be 8 (asserted)
 * @return sum of absolute transform coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int coef[64];
    int total = 0;
    int k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every line */
    for (k = 0; k < 8; k++) {
        //FIXME try pointer walks
        int     *line = coef + 8 * k;
        uint8_t *cur  = src  + stride * k;

        BUTTERFLY2(line[0], line[1], cur[0], cur[1]);
        BUTTERFLY2(line[2], line[3], cur[2], cur[3]);
        BUTTERFLY2(line[4], line[5], cur[4], cur[5]);
        BUTTERFLY2(line[6], line[7], cur[6], cur[7]);

        BUTTERFLY1(line[0], line[2]);
        BUTTERFLY1(line[1], line[3]);
        BUTTERFLY1(line[4], line[6]);
        BUTTERFLY1(line[5], line[7]);

        BUTTERFLY1(line[0], line[4]);
        BUTTERFLY1(line[1], line[5]);
        BUTTERFLY1(line[2], line[6]);
        BUTTERFLY1(line[3], line[7]);
    }

    /* vertical pass: two in-place stages, the third is folded into the
     * absolute-value sum via BUTTERFLYA */
    for (k = 0; k < 8; k++) {
        int *column = coef + k;

        BUTTERFLY1(column[8 * 0], column[8 * 1]);
        BUTTERFLY1(column[8 * 2], column[8 * 3]);
        BUTTERFLY1(column[8 * 4], column[8 * 5]);
        BUTTERFLY1(column[8 * 6], column[8 * 7]);

        BUTTERFLY1(column[8 * 0], column[8 * 2]);
        BUTTERFLY1(column[8 * 1], column[8 * 3]);
        BUTTERFLY1(column[8 * 4], column[8 * 6]);
        BUTTERFLY1(column[8 * 5], column[8 * 7]);

        total += BUTTERFLYA(column[8 * 0], column[8 * 4])
               + BUTTERFLYA(column[8 * 1], column[8 * 5])
               + BUTTERFLYA(column[8 * 2], column[8 * 6])
               + BUTTERFLYA(column[8 * 3], column[8 * 7]);
    }

    total -= FFABS(coef[8 * 0] + coef[8 * 4]); // -mean

    return total;
}
2051
2052 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2053     MpegEncContext * const s= (MpegEncContext *)c;
2054     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2055
2056     assert(h==8);
2057
2058     s->dsp.diff_pixels(temp, src1, src2, stride);
2059     s->dsp.fdct(temp);
2060     return s->dsp.sum_abs_dctelem(temp);
2061 }
2062
#if CONFIG_GPL
/* One 8-point integer transform pass. The user supplies SRC(x) to read
 * input element x and DST(x, v) to consume output element x. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block pair by applying DCT8_1D to the rows and then the
 * columns of their pixel difference, summing the absolute values of the
 * second-pass outputs.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive lines of both blocks
 * @param h      unused
 * @return sum of absolute transform outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h)
{
    MpegEncContext * const ctx = (MpegEncContext *)c;
    int16_t blk[8][8];
    int total = 0;
    int k;

    ctx->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* first pass: transform each row in place */
#define SRC(x) blk[k][x]
#define DST(x,v) blk[k][x]= v
    for (k = 0; k < 8; k++)
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform each column, accumulating |output| only */
#define SRC(x) blk[x][k]
#define DST(x,v) total += FFABS(v)
    for (k = 0; k < 8; k++)
        DCT8_1D
#undef SRC
#undef DST
    return total;
}
#endif
2115
2116 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2117     MpegEncContext * const s= (MpegEncContext *)c;
2118     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2119     int sum=0, i;
2120
2121     assert(h==8);
2122
2123     s->dsp.diff_pixels(temp, src1, src2, stride);
2124     s->dsp.fdct(temp);
2125
2126     for(i=0; i<64; i++)
2127         sum= FFMAX(sum, FFABS(temp[i]));
2128
2129     return sum;
2130 }
2131
2132 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2133     MpegEncContext * const s= (MpegEncContext *)c;
2134     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2135     int16_t * const bak = temp+64;
2136     int sum=0, i;
2137
2138     assert(h==8);
2139     s->mb_intra=0;
2140
2141     s->dsp.diff_pixels(temp, src1, src2, stride);
2142
2143     memcpy(bak, temp, 64*sizeof(int16_t));
2144
2145     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2146     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2147     ff_simple_idct_8(temp); //FIXME
2148
2149     for(i=0; i<64; i++)
2150         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2151
2152     return sum;
2153 }
2154
2155 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2156     MpegEncContext * const s= (MpegEncContext *)c;
2157     const uint8_t *scantable= s->intra_scantable.permutated;
2158     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2159     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2160     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2161     int i, last, run, bits, level, distortion, start_i;
2162     const int esc_length= s->ac_esc_length;
2163     uint8_t * length;
2164     uint8_t * last_length;
2165
2166     assert(h==8);
2167
2168     copy_block8(lsrc1, src1, 8, stride, 8);
2169     copy_block8(lsrc2, src2, 8, stride, 8);
2170
2171     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2172
2173     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2174
2175     bits=0;
2176
2177     if (s->mb_intra) {
2178         start_i = 1;
2179         length     = s->intra_ac_vlc_length;
2180         last_length= s->intra_ac_vlc_last_length;
2181         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2182     } else {
2183         start_i = 0;
2184         length     = s->inter_ac_vlc_length;
2185         last_length= s->inter_ac_vlc_last_length;
2186     }
2187
2188     if(last>=start_i){
2189         run=0;
2190         for(i=start_i; i<last; i++){
2191             int j= scantable[i];
2192             level= temp[j];
2193
2194             if(level){
2195                 level+=64;
2196                 if((level&(~127)) == 0){
2197                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2198                 }else
2199                     bits+= esc_length;
2200                 run=0;
2201             }else
2202                 run++;
2203         }
2204         i= scantable[last];
2205
2206         level= temp[i] + 64;
2207
2208         assert(level - 64);
2209
2210         if((level&(~127)) == 0){
2211             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2212         }else
2213             bits+= esc_length;
2214
2215     }
2216
2217     if(last>=0){
2218         if(s->mb_intra)
2219             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2220         else
2221             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2222     }
2223
2224     s->dsp.idct_add(lsrc2, 8, temp);
2225
2226     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2227
2228     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2229 }
2230
2231 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2232     MpegEncContext * const s= (MpegEncContext *)c;
2233     const uint8_t *scantable= s->intra_scantable.permutated;
2234     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2235     int i, last, run, bits, level, start_i;
2236     const int esc_length= s->ac_esc_length;
2237     uint8_t * length;
2238     uint8_t * last_length;
2239
2240     assert(h==8);
2241
2242     s->dsp.diff_pixels(temp, src1, src2, stride);
2243
2244     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2245
2246     bits=0;
2247
2248     if (s->mb_intra) {
2249         start_i = 1;
2250         length     = s->intra_ac_vlc_length;
2251         last_length= s->intra_ac_vlc_last_length;
2252         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2253     } else {
2254         start_i = 0;
2255         length     = s->inter_ac_vlc_length;
2256         last_length= s->inter_ac_vlc_last_length;
2257     }
2258
2259     if(last>=start_i){
2260         run=0;
2261         for(i=start_i; i<last; i++){
2262             int j= scantable[i];
2263             level= temp[j];
2264
2265             if(level){
2266                 level+=64;
2267                 if((level&(~127)) == 0){
2268                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2269                 }else
2270                     bits+= esc_length;
2271                 run=0;
2272             }else
2273                 run++;
2274         }
2275         i= scantable[last];
2276
2277         level= temp[i] + 64;
2278
2279         assert(level - 64);
2280
2281         if((level&(~127)) == 0){
2282             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2283         }else
2284             bits+= esc_length;
2285     }
2286
2287     return bits;
2288 }
2289
/* Generates vsad_intra<size>_c(): the sum of absolute differences between
 * each pixel and the pixel directly below it, over a block <size> pixels
 * wide and h lines tall (h - 1 line pairs). The c and dummy arguments are
 * unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                          \
    int row, col;                                           \
                                                            \
    for (row = 1; row < h; row++) {                         \
        for (col = 0; col < size; col++)                    \
            total += FFABS(s[col] - s[col + stride]);       \
        s += stride;                                        \
    }                                                       \
                                                            \
    return total;                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2307
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block: for every pixel, |(s1 - s2) - (s1 - s2 one line
 * below)|.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive lines of both blocks
 * @param h      block height; h - 1 line pairs are evaluated
 * @return accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += FFABS(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2322
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): the sum of squared differences between
 * each pixel and the pixel directly below it, over a block <size> pixels
 * wide and h lines tall (h - 1 line pairs). The c and dummy arguments are
 * unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                          \
    int row, col;                                           \
                                                            \
    for (row = 1; row < h; row++) {                         \
        for (col = 0; col < size; col++)                    \
            total += SQ(s[col] - s[col + stride]);          \
        s += stride;                                        \
    }                                                       \
                                                            \
    return total;                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2341
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block: for every pixel, the square of
 * (s1 - s2) - (s1 - s2 one line below).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive lines of both blocks
 * @param h      block height; h - 1 line pairs are evaluated
 * @return accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *next1 = s1 + stride;
        uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - next1[col] + next2[col]);

        s1 = next1;
        s2 = next2;
    }

    return total;
}
2356
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements to compare
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int err = pix1[k] - pix2[k];
        total += err * err;
    }
    return total;
}
2365
/* Builds a 16-pixel-wide comparison function out of an 8x8 one: the 8x8
 * function is run on the left and right halves of the top 8 lines and,
 * only when h == 16, on both halves of the next 8 lines as well. The calls
 * are kept as separate statements so they happen in a fixed order. */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int total = 0;\
    total += name8(s, dst    , src    , stride, 8);\
    total += name8(s, dst + 8, src + 8, stride, 8);\
    if (h == 16) {\
        uint8_t *dst_low = dst + 8 * stride;\
        uint8_t *src_low = src + 8 * stride;\
        total += name8(s, dst_low    , src_low    , stride, 8);\
        total += name8(s, dst_low + 8, src_low + 8, stride, 8);\
    }\
    return total;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2390
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip, as a 32-bit pattern
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when (a with bit 31 flipped) > maxisign
 * @param maxisign comparison threshold for the flipped value; the caller in
 *                 this file passes maxi with bit 31 flipped
 * @return mini, maxi or a, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2399
/**
 * Clip a float vector by operating on the raw 32-bit patterns of the
 * values with clipf_c_one(). The only caller in this file uses it when
 * min < 0 and max > 0.
 *
 * Elements are processed in groups of 8, so a whole group is always
 * touched even if len is not a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len)
{
    const uint32_t lo_bits    = *(uint32_t*)min;
    const uint32_t hi_bits    = *(uint32_t*)max;
    const uint32_t hi_flipped = hi_bits ^ (1U<<31);
    uint32_t *out             = (uint32_t*)dst;
    const uint32_t *in        = (const uint32_t*)src;
    int base, lane;

    for (base = 0; base < len; base += 8)
        for (lane = 0; lane < 8; lane++)
            out[base + lane] = clipf_c_one(in[base + lane], lo_bits, hi_bits, hi_flipped);
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When min is negative and max is positive the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied to each
 * element. Elements are processed in groups of 8, so a whole group is
 * always touched even if len is not a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, lane;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (lane = 0; lane < 8; lane++)
            dst[base + lane] = av_clipf(src[base + lane], min, max);
}
2435
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[k] * v2[k]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++)
        acc += v1[k] * v2[k];

    return acc;
}
2445
/**
 * Dot product of v1 and v2 combined with an in-place update of v1.
 *
 * For every element the product v1 * v2 is accumulated using the value of
 * v1 before the update, then mul * v3 is added to v1.
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return sum of (old v1[k]) * v2[k]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2455
/**
 * Apply a symmetric window to an int16_t signal.
 *
 * Only the first len / 2 window coefficients are read; coefficient i
 * scales both sample i and sample len - 1 - i. Each product gets 1 << 14
 * added and is shifted right by 15. For an odd len the middle sample is
 * not written.
 *
 * @param output destination samples
 * @param input  source samples
 * @param window window coefficients
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int head;

    for (head = 0; head < half; head++) {
        const unsigned int tail = len - head - 1;
        const int16_t w         = window[head];

        output[head] = (MUL16(input[head], w) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], w) + (1 << 14)) >> 15;
    }
}
2468
/**
 * Clip every int32_t of src to the range [min, max] and store it in dst.
 *
 * The elements are processed one at a time under a bounded loop, so a len of
 * zero does nothing and a len that is not a multiple of 8 is handled exactly;
 * an unrolled loop that decrements an unsigned counter by 8 would wrap around
 * in both cases and run past the end of the buffers.
 *
 * @param dst destination array
 * @param src source array
 * @param min lower clipping bound
 * @param max upper clipping bound
 * @param len number of elements to process
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
2484
/**
 * Pass block through ff_j_rev_dct(), then store it to dest with
 * put_pixels_clamped_c().
 *
 * @param dest      destination pixels
 * @param line_size value forwarded to put_pixels_clamped_c() as its line size
 * @param block     coefficient block handed to ff_j_rev_dct()
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Pass block through ff_j_rev_dct(), then combine it with dest using
 * add_pixels_clamped_c().
 *
 * @param dest      destination pixels
 * @param line_size value forwarded to add_pixels_clamped_c() as its line size
 * @param block     coefficient block handed to ff_j_rev_dct()
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2495
2496 /* init static data */
2497 av_cold void ff_dsputil_static_init(void)
2498 {
2499     int i;
2500
2501     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2502     for(i=0;i<MAX_NEG_CROP;i++) {
2503         ff_cropTbl[i] = 0;
2504         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2505     }
2506
2507     for(i=0;i<512;i++) {
2508         ff_squareTbl[i] = (i - 256) * (i - 256);
2509     }
2510
2511     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2512 }
2513
2514 int ff_check_alignment(void){
2515     static int did_fail=0;
2516     LOCAL_ALIGNED_16(int, aligned, [4]);
2517
2518     if((intptr_t)aligned & 15){
2519         if(!did_fail){
2520 #if HAVE_MMX || HAVE_ALTIVEC
2521             av_log(NULL, AV_LOG_ERROR,
2522                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2523                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2524                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2525                 "Do not report crashes to Libav developers.\n");
2526 #endif
2527             did_fail=1;
2528         }
2529         return -1;
2530     }
2531     return 0;
2532 }
2533
2534 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2535 {
2536     ff_check_alignment();
2537
2538 #if CONFIG_ENCODERS
2539     if (avctx->bits_per_raw_sample == 10) {
2540         c->fdct    = ff_jpeg_fdct_islow_10;
2541         c->fdct248 = ff_fdct248_islow_10;
2542     } else {
2543         if(avctx->dct_algo==FF_DCT_FASTINT) {
2544             c->fdct    = ff_fdct_ifast;
2545             c->fdct248 = ff_fdct_ifast248;
2546         }
2547         else if(avctx->dct_algo==FF_DCT_FAAN) {
2548             c->fdct    = ff_faandct;
2549             c->fdct248 = ff_faandct248;
2550         }
2551         else {
2552             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2553             c->fdct248 = ff_fdct248_islow_8;
2554         }
2555     }
2556 #endif //CONFIG_ENCODERS
2557
2558     if (avctx->bits_per_raw_sample == 10) {
2559         c->idct_put              = ff_simple_idct_put_10;
2560         c->idct_add              = ff_simple_idct_add_10;
2561         c->idct                  = ff_simple_idct_10;
2562         c->idct_permutation_type = FF_NO_IDCT_PERM;
2563     } else {
2564         if(avctx->idct_algo==FF_IDCT_INT){
2565             c->idct_put= ff_jref_idct_put;
2566             c->idct_add= ff_jref_idct_add;
2567             c->idct    = ff_j_rev_dct;
2568             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2569         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2570             c->idct_put= ff_faanidct_put;
2571             c->idct_add= ff_faanidct_add;
2572             c->idct    = ff_faanidct;
2573             c->idct_permutation_type= FF_NO_IDCT_PERM;
2574         }else{ //accurate/default
2575             c->idct_put = ff_simple_idct_put_8;
2576             c->idct_add = ff_simple_idct_add_8;
2577             c->idct     = ff_simple_idct_8;
2578             c->idct_permutation_type= FF_NO_IDCT_PERM;
2579         }
2580     }
2581
2582     c->diff_pixels = diff_pixels_c;
2583     c->put_pixels_clamped = put_pixels_clamped_c;
2584     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2585     c->add_pixels_clamped = add_pixels_clamped_c;
2586     c->sum_abs_dctelem = sum_abs_dctelem_c;
2587     c->gmc1 = gmc1_c;
2588     c->gmc = ff_gmc_c;
2589     c->pix_sum = pix_sum_c;
2590     c->pix_norm1 = pix_norm1_c;
2591
2592     c->fill_block_tab[0] = fill_block16_c;
2593     c->fill_block_tab[1] = fill_block8_c;
2594
2595     /* TODO [0] 16  [1] 8 */
2596     c->pix_abs[0][0] = pix_abs16_c;
2597     c->pix_abs[0][1] = pix_abs16_x2_c;
2598     c->pix_abs[0][2] = pix_abs16_y2_c;
2599     c->pix_abs[0][3] = pix_abs16_xy2_c;
2600     c->pix_abs[1][0] = pix_abs8_c;
2601     c->pix_abs[1][1] = pix_abs8_x2_c;
2602     c->pix_abs[1][2] = pix_abs8_y2_c;
2603     c->pix_abs[1][3] = pix_abs8_xy2_c;
2604
2605     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2606     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2607     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2608     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2609     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2610     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2611     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2612     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2613     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2614
2615     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2616     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2617     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2618     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2619     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2620     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2621     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2622     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2623     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2624
2625 #define dspfunc(PFX, IDX, NUM) \
2626     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2627     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2628     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2629     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2630     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2631     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2632     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2633     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2634     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2635     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2636     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2637     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2638     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2639     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2640     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2641     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2642
2643     dspfunc(put_qpel, 0, 16);
2644     dspfunc(put_no_rnd_qpel, 0, 16);
2645
2646     dspfunc(avg_qpel, 0, 16);
2647     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2648
2649     dspfunc(put_qpel, 1, 8);
2650     dspfunc(put_no_rnd_qpel, 1, 8);
2651
2652     dspfunc(avg_qpel, 1, 8);
2653     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2654
2655 #undef dspfunc
2656
2657     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2658     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2659     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2660     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2661     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2662     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2663     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2664     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2665
2666 #define SET_CMP_FUNC(name) \
2667     c->name[0]= name ## 16_c;\
2668     c->name[1]= name ## 8x8_c;
2669
2670     SET_CMP_FUNC(hadamard8_diff)
2671     c->hadamard8_diff[4]= hadamard8_intra16_c;
2672     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2673     SET_CMP_FUNC(dct_sad)
2674     SET_CMP_FUNC(dct_max)
2675 #if CONFIG_GPL
2676     SET_CMP_FUNC(dct264_sad)
2677 #endif
2678     c->sad[0]= pix_abs16_c;
2679     c->sad[1]= pix_abs8_c;
2680     c->sse[0]= sse16_c;
2681     c->sse[1]= sse8_c;
2682     c->sse[2]= sse4_c;
2683     SET_CMP_FUNC(quant_psnr)
2684     SET_CMP_FUNC(rd)
2685     SET_CMP_FUNC(bit)
2686     c->vsad[0]= vsad16_c;
2687     c->vsad[4]= vsad_intra16_c;
2688     c->vsad[5]= vsad_intra8_c;
2689     c->vsse[0]= vsse16_c;
2690     c->vsse[4]= vsse_intra16_c;
2691     c->vsse[5]= vsse_intra8_c;
2692     c->nsse[0]= nsse16_c;
2693     c->nsse[1]= nsse8_c;
2694
2695     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2696
2697     c->add_bytes= add_bytes_c;
2698     c->diff_bytes= diff_bytes_c;
2699     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2700     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2701     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2702     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2703     c->bswap_buf= bswap_buf;
2704     c->bswap16_buf = bswap16_buf;
2705
2706     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2707         c->h263_h_loop_filter= h263_h_loop_filter_c;
2708         c->h263_v_loop_filter= h263_v_loop_filter_c;
2709     }
2710
2711     c->try_8x8basis= try_8x8basis_c;
2712     c->add_8x8basis= add_8x8basis_c;
2713
2714     c->vector_clipf = vector_clipf_c;
2715     c->scalarproduct_int16 = scalarproduct_int16_c;
2716     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2717     c->apply_window_int16 = apply_window_int16_c;
2718     c->vector_clip_int32 = vector_clip_int32_c;
2719
2720     c->shrink[0]= av_image_copy_plane;
2721     c->shrink[1]= ff_shrink22;
2722     c->shrink[2]= ff_shrink44;
2723     c->shrink[3]= ff_shrink88;
2724
2725     c->add_pixels8 = add_pixels8_c;
2726
2727 #define hpel_funcs(prefix, idx, num) \
2728     c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2729     c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2730     c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2731     c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2732
2733     hpel_funcs(put, [0], 16);
2734     hpel_funcs(put, [1],  8);
2735     hpel_funcs(put, [2],  4);
2736     hpel_funcs(put, [3],  2);
2737     hpel_funcs(put_no_rnd, [0], 16);
2738     hpel_funcs(put_no_rnd, [1],  8);
2739     hpel_funcs(avg, [0], 16);
2740     hpel_funcs(avg, [1],  8);
2741     hpel_funcs(avg, [2],  4);
2742     hpel_funcs(avg, [3],  2);
2743     hpel_funcs(avg_no_rnd,, 16);
2744
2745 #undef FUNC
2746 #undef FUNCC
2747 #define FUNC(f, depth) f ## _ ## depth
2748 #define FUNCC(f, depth) f ## _ ## depth ## _c
2749
2750 #define BIT_DEPTH_FUNCS(depth, dct)\
2751     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
2752     c->draw_edges                    = FUNCC(draw_edges            , depth);\
2753     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
2754     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
2755
2756     switch (avctx->bits_per_raw_sample) {
2757     case 9:
2758         if (c->dct_bits == 32) {
2759             BIT_DEPTH_FUNCS(9, _32);
2760         } else {
2761             BIT_DEPTH_FUNCS(9, _16);
2762         }
2763         break;
2764     case 10:
2765         if (c->dct_bits == 32) {
2766             BIT_DEPTH_FUNCS(10, _32);
2767         } else {
2768             BIT_DEPTH_FUNCS(10, _16);
2769         }
2770         break;
2771     default:
2772         BIT_DEPTH_FUNCS(8, _16);
2773         break;
2774     }
2775
2776
2777     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2778     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2779     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2780     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2781     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2782     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2783     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2784
2785     ff_init_scantable_permutation(c->idct_permutation,
2786                                   c->idct_permutation_type);
2787 }