git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "libavutil/internal.h"
  32 #include "avcodec.h"
  33 #include "copy_block.h"
  34 #include "dct.h"
  35 #include "dsputil.h"
  36 #include "simple_idct.h"
  37 #include "faandct.h"
  38 #include "faanidct.h"
  39 #include "imgconvert.h"
  40 #include "mathops.h"
  41 #include "mpegvideo.h"
  42 #include "config.h"
  43
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 16
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 8
  51 #include "dsputil_template.c"
  52
  53 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  54 #define pb_7f (~0UL/255 * 0x7f)
  55 #define pb_80 (~0UL/255 * 0x80)
  56
  57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  58    specification, we interleave the fields */
  59 const uint8_t ff_zigzag248_direct[64] = {
  60      0,  8,  1,  9, 16, 24,  2, 10,
  61     17, 25, 32, 40, 48, 56, 33, 41,
  62     18, 26,  3, 11,  4, 12, 19, 27,
  63     34, 42, 49, 57, 50, 58, 35, 43,
  64     20, 28,  5, 13,  6, 14, 21, 29,
  65     36, 44, 51, 59, 52, 60, 37, 45,
  66     22, 30,  7, 15, 23, 31, 38, 46,
  67     53, 61, 54, 62, 39, 47, 55, 63,
  68 };
  69
/* Non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer.
 * 64 entries of uint16_t, declared with 16-byte alignment.
 * NOTE(review): the table is only declared here; it is presumably filled by
 * init code outside this section -- confirm. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  72
  73 const uint8_t ff_alternate_horizontal_scan[64] = {
  74     0,  1,   2,  3,  8,  9, 16, 17,
  75     10, 11,  4,  5,  6,  7, 15, 14,
  76     13, 12, 19, 18, 24, 25, 32, 33,
  77     26, 27, 20, 21, 22, 23, 28, 29,
  78     30, 31, 34, 35, 40, 41, 48, 49,
  79     42, 43, 36, 37, 38, 39, 44, 45,
  80     46, 47, 50, 51, 56, 57, 58, 59,
  81     52, 53, 54, 55, 60, 61, 62, 63,
  82 };
  83
  84 const uint8_t ff_alternate_vertical_scan[64] = {
  85     0,  8,  16, 24,  1,  9,  2, 10,
  86     17, 25, 32, 40, 48, 56, 57, 49,
  87     41, 33, 26, 18,  3, 11,  4, 12,
  88     19, 27, 34, 42, 50, 58, 35, 43,
  89     51, 59, 20, 28,  5, 13,  6, 14,
  90     21, 29, 36, 44, 52, 60, 37, 45,
  91     53, 61, 22, 30,  7, 15, 23, 31,
  92     38, 46, 54, 62, 39, 47, 55, 63,
  93 };
  94
  95 /* Input permutation for the simple_idct_mmx */
  96 static const uint8_t simple_mmx_permutation[64]={
  97         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  98         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
  99         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 100         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 101         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 102         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 103         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 104         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 105 };
 106
 107 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 108
 109 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 110     int i;
 111     int end;
 112
 113     st->scantable= src_scantable;
 114
 115     for(i=0; i<64; i++){
 116         int j;
 117         j = src_scantable[i];
 118         st->permutated[i] = permutation[j];
 119     }
 120
 121     end=-1;
 122     for(i=0; i<64; i++){
 123         int j;
 124         j = st->permutated[i];
 125         if(j>end) end=j;
 126         st->raster_end[i]= end;
 127     }
 128 }
 129
 130 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 131                                    int idct_permutation_type)
 132 {
 133     int i;
 134
 135     switch(idct_permutation_type){
 136     case FF_NO_IDCT_PERM:
 137         for(i=0; i<64; i++)
 138             idct_permutation[i]= i;
 139         break;
 140     case FF_LIBMPEG2_IDCT_PERM:
 141         for(i=0; i<64; i++)
 142             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 143         break;
 144     case FF_SIMPLE_IDCT_PERM:
 145         for(i=0; i<64; i++)
 146             idct_permutation[i]= simple_mmx_permutation[i];
 147         break;
 148     case FF_TRANSPOSE_IDCT_PERM:
 149         for(i=0; i<64; i++)
 150             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 151         break;
 152     case FF_PARTTRANS_IDCT_PERM:
 153         for(i=0; i<64; i++)
 154             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 155         break;
 156     case FF_SSE2_IDCT_PERM:
 157         for(i=0; i<64; i++)
 158             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 159         break;
 160     default:
 161         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 162     }
 163 }
 164
 165 static int pix_sum_c(uint8_t * pix, int line_size)
 166 {
 167     int s, i, j;
 168
 169     s = 0;
 170     for (i = 0; i < 16; i++) {
 171         for (j = 0; j < 16; j += 8) {
 172             s += pix[0];
 173             s += pix[1];
 174             s += pix[2];
 175             s += pix[3];
 176             s += pix[4];
 177             s += pix[5];
 178             s += pix[6];
 179             s += pix[7];
 180             pix += 8;
 181         }
 182         pix += line_size - 16;
 183     }
 184     return s;
 185 }
 186
 187 static int pix_norm1_c(uint8_t * pix, int line_size)
 188 {
 189     int s, i, j;
 190     uint32_t *sq = ff_squareTbl + 256;
 191
 192     s = 0;
 193     for (i = 0; i < 16; i++) {
 194         for (j = 0; j < 16; j += 8) {
 195 #if 0
 196             s += sq[pix[0]];
 197             s += sq[pix[1]];
 198             s += sq[pix[2]];
 199             s += sq[pix[3]];
 200             s += sq[pix[4]];
 201             s += sq[pix[5]];
 202             s += sq[pix[6]];
 203             s += sq[pix[7]];
 204 #else
 205 #if HAVE_FAST_64BIT
 206             register uint64_t x=*(uint64_t*)pix;
 207             s += sq[x&0xff];
 208             s += sq[(x>>8)&0xff];
 209             s += sq[(x>>16)&0xff];
 210             s += sq[(x>>24)&0xff];
 211             s += sq[(x>>32)&0xff];
 212             s += sq[(x>>40)&0xff];
 213             s += sq[(x>>48)&0xff];
 214             s += sq[(x>>56)&0xff];
 215 #else
 216             register uint32_t x=*(uint32_t*)pix;
 217             s += sq[x&0xff];
 218             s += sq[(x>>8)&0xff];
 219             s += sq[(x>>16)&0xff];
 220             s += sq[(x>>24)&0xff];
 221             x=*(uint32_t*)(pix+4);
 222             s += sq[x&0xff];
 223             s += sq[(x>>8)&0xff];
 224             s += sq[(x>>16)&0xff];
 225             s += sq[(x>>24)&0xff];
 226 #endif
 227 #endif
 228             pix += 8;
 229         }
 230         pix += line_size - 16;
 231     }
 232     return s;
 233 }
 234
 235 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 236     int i;
 237
 238     for(i=0; i+8<=w; i+=8){
 239         dst[i+0]= av_bswap32(src[i+0]);
 240         dst[i+1]= av_bswap32(src[i+1]);
 241         dst[i+2]= av_bswap32(src[i+2]);
 242         dst[i+3]= av_bswap32(src[i+3]);
 243         dst[i+4]= av_bswap32(src[i+4]);
 244         dst[i+5]= av_bswap32(src[i+5]);
 245         dst[i+6]= av_bswap32(src[i+6]);
 246         dst[i+7]= av_bswap32(src[i+7]);
 247     }
 248     for(;i<w; i++){
 249         dst[i+0]= av_bswap32(src[i+0]);
 250     }
 251 }
 252
 253 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 254 {
 255     while (len--)
 256         *dst++ = av_bswap16(*src++);
 257 }
 258
 259 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 260 {
 261     int s, i;
 262     uint32_t *sq = ff_squareTbl + 256;
 263
 264     s = 0;
 265     for (i = 0; i < h; i++) {
 266         s += sq[pix1[0] - pix2[0]];
 267         s += sq[pix1[1] - pix2[1]];
 268         s += sq[pix1[2] - pix2[2]];
 269         s += sq[pix1[3] - pix2[3]];
 270         pix1 += line_size;
 271         pix2 += line_size;
 272     }
 273     return s;
 274 }
 275
 276 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 277 {
 278     int s, i;
 279     uint32_t *sq = ff_squareTbl + 256;
 280
 281     s = 0;
 282     for (i = 0; i < h; i++) {
 283         s += sq[pix1[0] - pix2[0]];
 284         s += sq[pix1[1] - pix2[1]];
 285         s += sq[pix1[2] - pix2[2]];
 286         s += sq[pix1[3] - pix2[3]];
 287         s += sq[pix1[4] - pix2[4]];
 288         s += sq[pix1[5] - pix2[5]];
 289         s += sq[pix1[6] - pix2[6]];
 290         s += sq[pix1[7] - pix2[7]];
 291         pix1 += line_size;
 292         pix2 += line_size;
 293     }
 294     return s;
 295 }
 296
 297 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 298 {
 299     int s, i;
 300     uint32_t *sq = ff_squareTbl + 256;
 301
 302     s = 0;
 303     for (i = 0; i < h; i++) {
 304         s += sq[pix1[ 0] - pix2[ 0]];
 305         s += sq[pix1[ 1] - pix2[ 1]];
 306         s += sq[pix1[ 2] - pix2[ 2]];
 307         s += sq[pix1[ 3] - pix2[ 3]];
 308         s += sq[pix1[ 4] - pix2[ 4]];
 309         s += sq[pix1[ 5] - pix2[ 5]];
 310         s += sq[pix1[ 6] - pix2[ 6]];
 311         s += sq[pix1[ 7] - pix2[ 7]];
 312         s += sq[pix1[ 8] - pix2[ 8]];
 313         s += sq[pix1[ 9] - pix2[ 9]];
 314         s += sq[pix1[10] - pix2[10]];
 315         s += sq[pix1[11] - pix2[11]];
 316         s += sq[pix1[12] - pix2[12]];
 317         s += sq[pix1[13] - pix2[13]];
 318         s += sq[pix1[14] - pix2[14]];
 319         s += sq[pix1[15] - pix2[15]];
 320
 321         pix1 += line_size;
 322         pix2 += line_size;
 323     }
 324     return s;
 325 }
 326
 327 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 328                           const uint8_t *s2, int stride){
 329     int i;
 330
 331     /* read the pixels */
 332     for(i=0;i<8;i++) {
 333         block[0] = s1[0] - s2[0];
 334         block[1] = s1[1] - s2[1];
 335         block[2] = s1[2] - s2[2];
 336         block[3] = s1[3] - s2[3];
 337         block[4] = s1[4] - s2[4];
 338         block[5] = s1[5] - s2[5];
 339         block[6] = s1[6] - s2[6];
 340         block[7] = s1[7] - s2[7];
 341         s1 += stride;
 342         s2 += stride;
 343         block += 8;
 344     }
 345 }
 346
 347
 348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 349                                  int line_size)
 350 {
 351     int i;
 352
 353     /* read the pixels */
 354     for(i=0;i<8;i++) {
 355         pixels[0] = av_clip_uint8(block[0]);
 356         pixels[1] = av_clip_uint8(block[1]);
 357         pixels[2] = av_clip_uint8(block[2]);
 358         pixels[3] = av_clip_uint8(block[3]);
 359         pixels[4] = av_clip_uint8(block[4]);
 360         pixels[5] = av_clip_uint8(block[5]);
 361         pixels[6] = av_clip_uint8(block[6]);
 362         pixels[7] = av_clip_uint8(block[7]);
 363
 364         pixels += line_size;
 365         block += 8;
 366     }
 367 }
 368
 369 static void put_signed_pixels_clamped_c(const int16_t *block,
 370                                         uint8_t *restrict pixels,
 371                                         int line_size)
 372 {
 373     int i, j;
 374
 375     for (i = 0; i < 8; i++) {
 376         for (j = 0; j < 8; j++) {
 377             if (*block < -128)
 378                 *pixels = 0;
 379             else if (*block > 127)
 380                 *pixels = 255;
 381             else
 382                 *pixels = (uint8_t)(*block + 128);
 383             block++;
 384             pixels++;
 385         }
 386         pixels += (line_size - 8);
 387     }
 388 }
 389
 390 static void add_pixels8_c(uint8_t *restrict pixels,
 391                           int16_t *block,
 392                           int line_size)
 393 {
 394     int i;
 395
 396     for(i=0;i<8;i++) {
 397         pixels[0] += block[0];
 398         pixels[1] += block[1];
 399         pixels[2] += block[2];
 400         pixels[3] += block[3];
 401         pixels[4] += block[4];
 402         pixels[5] += block[5];
 403         pixels[6] += block[6];
 404         pixels[7] += block[7];
 405         pixels += line_size;
 406         block += 8;
 407     }
 408 }
 409
 410 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 411                                  int line_size)
 412 {
 413     int i;
 414
 415     /* read the pixels */
 416     for(i=0;i<8;i++) {
 417         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 418         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 419         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 420         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 421         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 422         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 423         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 424         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 425         pixels += line_size;
 426         block += 8;
 427     }
 428 }
 429
 430 static int sum_abs_dctelem_c(int16_t *block)
 431 {
 432     int sum=0, i;
 433     for(i=0; i<64; i++)
 434         sum+= FFABS(block[i]);
 435     return sum;
 436 }
 437
 438 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 439 {
 440     int i;
 441
 442     for (i = 0; i < h; i++) {
 443         memset(block, value, 16);
 444         block += line_size;
 445     }
 446 }
 447
 448 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 449 {
 450     int i;
 451
 452     for (i = 0; i < h; i++) {
 453         memset(block, value, 8);
 454         block += line_size;
 455     }
 456 }
 457
 458 #define avg2(a,b) ((a+b+1)>>1)
 459 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 460
 461 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 462 {
 463     const int A=(16-x16)*(16-y16);
 464     const int B=(   x16)*(16-y16);
 465     const int C=(16-x16)*(   y16);
 466     const int D=(   x16)*(   y16);
 467     int i;
 468
 469     for(i=0; i<h; i++)
 470     {
 471         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 472         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 473         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 474         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 475         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 476         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 477         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 478         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 479         dst+= stride;
 480         src+= stride;
 481     }
 482 }
 483
 484 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 485                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 486 {
 487     int y, vx, vy;
 488     const int s= 1<<shift;
 489
 490     width--;
 491     height--;
 492
 493     for(y=0; y<h; y++){
 494         int x;
 495
 496         vx= ox;
 497         vy= oy;
 498         for(x=0; x<8; x++){ //XXX FIXME optimize
 499             int src_x, src_y, frac_x, frac_y, index;
 500
 501             src_x= vx>>16;
 502             src_y= vy>>16;
 503             frac_x= src_x&(s-1);
 504             frac_y= src_y&(s-1);
 505             src_x>>=shift;
 506             src_y>>=shift;
 507
 508             if((unsigned)src_x < width){
 509                 if((unsigned)src_y < height){
 510                     index= src_x + src_y*stride;
 511                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 512                                            + src[index       +1]*   frac_x )*(s-frac_y)
 513                                         + (  src[index+stride  ]*(s-frac_x)
 514                                            + src[index+stride+1]*   frac_x )*   frac_y
 515                                         + r)>>(shift*2);
 516                 }else{
 517                     index= src_x + av_clip(src_y, 0, height)*stride;
 518                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 519                                           + src[index       +1]*   frac_x )*s
 520                                         + r)>>(shift*2);
 521                 }
 522             }else{
 523                 if((unsigned)src_y < height){
 524                     index= av_clip(src_x, 0, width) + src_y*stride;
 525                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 526                                            + src[index+stride  ]*   frac_y )*s
 527                                         + r)>>(shift*2);
 528                 }else{
 529                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 530                     dst[y*stride + x]=    src[index         ];
 531                 }
 532             }
 533
 534             vx+= dxx;
 535             vy+= dyx;
 536         }
 537         ox += dxy;
 538         oy += dyy;
 539     }
 540 }
 541
 542 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 543     switch(width){
 544     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 545     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 546     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 547     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 548     }
 549 }
 550
 551 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 552     int i,j;
 553     for (i=0; i < height; i++) {
 554       for (j=0; j < width; j++) {
 555         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 556       }
 557       src += stride;
 558       dst += stride;
 559     }
 560 }
 561
 562 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 563     int i,j;
 564     for (i=0; i < height; i++) {
 565       for (j=0; j < width; j++) {
 566         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 567       }
 568       src += stride;
 569       dst += stride;
 570     }
 571 }
 572
 573 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 574     int i,j;
 575     for (i=0; i < height; i++) {
 576       for (j=0; j < width; j++) {
 577         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 578       }
 579       src += stride;
 580       dst += stride;
 581     }
 582 }
 583
 584 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 585     int i,j;
 586     for (i=0; i < height; i++) {
 587       for (j=0; j < width; j++) {
 588         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 589       }
 590       src += stride;
 591       dst += stride;
 592     }
 593 }
 594
 595 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 596     int i,j;
 597     for (i=0; i < height; i++) {
 598       for (j=0; j < width; j++) {
 599         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 600       }
 601       src += stride;
 602       dst += stride;
 603     }
 604 }
 605
 606 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 607     int i,j;
 608     for (i=0; i < height; i++) {
 609       for (j=0; j < width; j++) {
 610         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 611       }
 612       src += stride;
 613       dst += stride;
 614     }
 615 }
 616
 617 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 618     int i,j;
 619     for (i=0; i < height; i++) {
 620       for (j=0; j < width; j++) {
 621         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 622       }
 623       src += stride;
 624       dst += stride;
 625     }
 626 }
 627
 628 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 629     int i,j;
 630     for (i=0; i < height; i++) {
 631       for (j=0; j < width; j++) {
 632         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 633       }
 634       src += stride;
 635       dst += stride;
 636     }
 637 }
 638
 639 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 640     switch(width){
 641     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 642     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 643     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 644     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 645     }
 646 }
 647
 648 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 649     int i,j;
 650     for (i=0; i < height; i++) {
 651       for (j=0; j < width; j++) {
 652         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 653       }
 654       src += stride;
 655       dst += stride;
 656     }
 657 }
 658
 659 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 660     int i,j;
 661     for (i=0; i < height; i++) {
 662       for (j=0; j < width; j++) {
 663         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 664       }
 665       src += stride;
 666       dst += stride;
 667     }
 668 }
 669
 670 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 671     int i,j;
 672     for (i=0; i < height; i++) {
 673       for (j=0; j < width; j++) {
 674         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 675       }
 676       src += stride;
 677       dst += stride;
 678     }
 679 }
 680
 681 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 682     int i,j;
 683     for (i=0; i < height; i++) {
 684       for (j=0; j < width; j++) {
 685         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 686       }
 687       src += stride;
 688       dst += stride;
 689     }
 690 }
 691
 692 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 693     int i,j;
 694     for (i=0; i < height; i++) {
 695       for (j=0; j < width; j++) {
 696         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 697       }
 698       src += stride;
 699       dst += stride;
 700     }
 701 }
 702
 703 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 704     int i,j;
 705     for (i=0; i < height; i++) {
 706       for (j=0; j < width; j++) {
 707         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 708       }
 709       src += stride;
 710       dst += stride;
 711     }
 712 }
 713
 714 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 715     int i,j;
 716     for (i=0; i < height; i++) {
 717       for (j=0; j < width; j++) {
 718         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 719       }
 720       src += stride;
 721       dst += stride;
 722     }
 723 }
 724
 725 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 726     int i,j;
 727     for (i=0; i < height; i++) {
 728       for (j=0; j < width; j++) {
 729         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 730       }
 731       src += stride;
 732       dst += stride;
 733     }
 734 }
 735
 736 #define QPEL_MC(r, OPNAME, RND, OP) \
 737 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 738     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 739     int i;\
 740     for(i=0; i<h; i++)\
 741     {\
 742         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 743         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 744         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 745         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 746         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 747         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 748         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 749         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 750         dst+=dstStride;\
 751         src+=srcStride;\
 752     }\
 753 }\
 754 \
 755 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 756     const int w=8;\
 757     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 758     int i;\
 759     for(i=0; i<w; i++)\
 760     {\
 761         const int src0= src[0*srcStride];\
 762         const int src1= src[1*srcStride];\
 763         const int src2= src[2*srcStride];\
 764         const int src3= src[3*srcStride];\
 765         const int src4= src[4*srcStride];\
 766         const int src5= src[5*srcStride];\
 767         const int src6= src[6*srcStride];\
 768         const int src7= src[7*srcStride];\
 769         const int src8= src[8*srcStride];\
 770         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 771         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 772         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 773         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 774         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 775         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 776         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 777         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 778         dst++;\
 779         src++;\
 780     }\
 781 }\
 782 \
 783 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 784     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 785     int i;\
 786     \
 787     for(i=0; i<h; i++)\
 788     {\
 789         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 790         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 791         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 792         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 793         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 794         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 795         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 796         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 797         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 798         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 799         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 800         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 801         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 802         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 803         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 804         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 805         dst+=dstStride;\
 806         src+=srcStride;\
 807     }\
 808 }\
 809 \
 810 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 811     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 812     int i;\
 813     const int w=16;\
 814     for(i=0; i<w; i++)\
 815     {\
 816         const int src0= src[0*srcStride];\
 817         const int src1= src[1*srcStride];\
 818         const int src2= src[2*srcStride];\
 819         const int src3= src[3*srcStride];\
 820         const int src4= src[4*srcStride];\
 821         const int src5= src[5*srcStride];\
 822         const int src6= src[6*srcStride];\
 823         const int src7= src[7*srcStride];\
 824         const int src8= src[8*srcStride];\
 825         const int src9= src[9*srcStride];\
 826         const int src10= src[10*srcStride];\
 827         const int src11= src[11*srcStride];\
 828         const int src12= src[12*srcStride];\
 829         const int src13= src[13*srcStride];\
 830         const int src14= src[14*srcStride];\
 831         const int src15= src[15*srcStride];\
 832         const int src16= src[16*srcStride];\
 833         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 834         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 835         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 836         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 837         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 838         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 839         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 840         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 841         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 842         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 843         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 844         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 845         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 846         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 847         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 848         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 849         dst++;\
 850         src++;\
 851     }\
 852 }\
 853 \
 854 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 855 {\
 856     uint8_t half[64];\
 857     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 858     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 859 }\
 860 \
 861 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 862 {\
 863     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 864 }\
 865 \
 866 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 867 {\
 868     uint8_t half[64];\
 869     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 870     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 871 }\
 872 \
 873 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 874 {\
 875     uint8_t full[16*9];\
 876     uint8_t half[64];\
 877     copy_block9(full, src, 16, stride, 9);\
 878     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 879     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 880 }\
 881 \
 882 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 883 {\
 884     uint8_t full[16*9];\
 885     copy_block9(full, src, 16, stride, 9);\
 886     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 887 }\
 888 \
 889 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 890 {\
 891     uint8_t full[16*9];\
 892     uint8_t half[64];\
 893     copy_block9(full, src, 16, stride, 9);\
 894     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 895     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 896 }\
 897 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 898 {\
 899     uint8_t full[16*9];\
 900     uint8_t halfH[72];\
 901     uint8_t halfV[64];\
 902     uint8_t halfHV[64];\
 903     copy_block9(full, src, 16, stride, 9);\
 904     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 905     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 906     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 907     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 908 }\
 909 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 910 {\
 911     uint8_t full[16*9];\
 912     uint8_t halfH[72];\
 913     uint8_t halfHV[64];\
 914     copy_block9(full, src, 16, stride, 9);\
 915     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 916     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 917     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 918     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 919 }\
 920 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 921 {\
 922     uint8_t full[16*9];\
 923     uint8_t halfH[72];\
 924     uint8_t halfV[64];\
 925     uint8_t halfHV[64];\
 926     copy_block9(full, src, 16, stride, 9);\
 927     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 928     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 929     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 930     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 931 }\
 932 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 933 {\
 934     uint8_t full[16*9];\
 935     uint8_t halfH[72];\
 936     uint8_t halfHV[64];\
 937     copy_block9(full, src, 16, stride, 9);\
 938     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 939     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 940     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 941     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 942 }\
 943 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 944 {\
 945     uint8_t full[16*9];\
 946     uint8_t halfH[72];\
 947     uint8_t halfV[64];\
 948     uint8_t halfHV[64];\
 949     copy_block9(full, src, 16, stride, 9);\
 950     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 951     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 952     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 953     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 954 }\
 955 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 956 {\
 957     uint8_t full[16*9];\
 958     uint8_t halfH[72];\
 959     uint8_t halfHV[64];\
 960     copy_block9(full, src, 16, stride, 9);\
 961     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 962     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 963     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 964     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 965 }\
 966 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 967 {\
 968     uint8_t full[16*9];\
 969     uint8_t halfH[72];\
 970     uint8_t halfV[64];\
 971     uint8_t halfHV[64];\
 972     copy_block9(full, src, 16, stride, 9);\
 973     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 974     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 975     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 976     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 977 }\
 978 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 979 {\
 980     uint8_t full[16*9];\
 981     uint8_t halfH[72];\
 982     uint8_t halfHV[64];\
 983     copy_block9(full, src, 16, stride, 9);\
 984     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 985     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 986     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 987     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 988 }\
 989 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 990 {\
 991     uint8_t halfH[72];\
 992     uint8_t halfHV[64];\
 993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 994     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 995     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 996 }\
 997 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 998 {\
 999     uint8_t halfH[72];\
1000     uint8_t halfHV[64];\
1001     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1002     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1004 }\
1005 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1006 {\
1007     uint8_t full[16*9];\
1008     uint8_t halfH[72];\
1009     uint8_t halfV[64];\
1010     uint8_t halfHV[64];\
1011     copy_block9(full, src, 16, stride, 9);\
1012     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1016 }\
1017 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1018 {\
1019     uint8_t full[16*9];\
1020     uint8_t halfH[72];\
1021     copy_block9(full, src, 16, stride, 9);\
1022     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1023     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1024     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1025 }\
1026 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1027 {\
1028     uint8_t full[16*9];\
1029     uint8_t halfH[72];\
1030     uint8_t halfV[64];\
1031     uint8_t halfHV[64];\
1032     copy_block9(full, src, 16, stride, 9);\
1033     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1034     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1035     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1036     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1037 }\
1038 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1039 {\
1040     uint8_t full[16*9];\
1041     uint8_t halfH[72];\
1042     copy_block9(full, src, 16, stride, 9);\
1043     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1044     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1045     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1046 }\
1047 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1048 {\
1049     uint8_t halfH[72];\
1050     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1051     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1052 }\
1053 \
1054 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1055 {\
1056     uint8_t half[256];\
1057     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1058     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1059 }\
1060 \
1061 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1062 {\
1063     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1064 }\
1065 \
1066 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1067 {\
1068     uint8_t half[256];\
1069     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1070     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1071 }\
1072 \
1073 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1074 {\
1075     uint8_t full[24*17];\
1076     uint8_t half[256];\
1077     copy_block17(full, src, 24, stride, 17);\
1078     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1079     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1080 }\
1081 \
1082 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1083 {\
1084     uint8_t full[24*17];\
1085     copy_block17(full, src, 24, stride, 17);\
1086     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1087 }\
1088 \
1089 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1090 {\
1091     uint8_t full[24*17];\
1092     uint8_t half[256];\
1093     copy_block17(full, src, 24, stride, 17);\
1094     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1095     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1096 }\
1097 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1098 {\
1099     uint8_t full[24*17];\
1100     uint8_t halfH[272];\
1101     uint8_t halfV[256];\
1102     uint8_t halfHV[256];\
1103     copy_block17(full, src, 24, stride, 17);\
1104     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1105     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1106     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1107     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1108 }\
1109 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1110 {\
1111     uint8_t full[24*17];\
1112     uint8_t halfH[272];\
1113     uint8_t halfHV[256];\
1114     copy_block17(full, src, 24, stride, 17);\
1115     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1116     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1117     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1118     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1119 }\
1120 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1121 {\
1122     uint8_t full[24*17];\
1123     uint8_t halfH[272];\
1124     uint8_t halfV[256];\
1125     uint8_t halfHV[256];\
1126     copy_block17(full, src, 24, stride, 17);\
1127     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1128     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1129     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1130     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1131 }\
1132 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1133 {\
1134     uint8_t full[24*17];\
1135     uint8_t halfH[272];\
1136     uint8_t halfHV[256];\
1137     copy_block17(full, src, 24, stride, 17);\
1138     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1139     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1140     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1141     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1142 }\
1143 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1144 {\
1145     uint8_t full[24*17];\
1146     uint8_t halfH[272];\
1147     uint8_t halfV[256];\
1148     uint8_t halfHV[256];\
1149     copy_block17(full, src, 24, stride, 17);\
1150     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1152     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1153     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1154 }\
1155 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1156 {\
1157     uint8_t full[24*17];\
1158     uint8_t halfH[272];\
1159     uint8_t halfHV[256];\
1160     copy_block17(full, src, 24, stride, 17);\
1161     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1162     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1163     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1164     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1165 }\
1166 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1167 {\
1168     uint8_t full[24*17];\
1169     uint8_t halfH[272];\
1170     uint8_t halfV[256];\
1171     uint8_t halfHV[256];\
1172     copy_block17(full, src, 24, stride, 17);\
1173     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1174     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1175     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1177 }\
1178 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1179 {\
1180     uint8_t full[24*17];\
1181     uint8_t halfH[272];\
1182     uint8_t halfHV[256];\
1183     copy_block17(full, src, 24, stride, 17);\
1184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1188 }\
1189 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1190 {\
1191     uint8_t halfH[272];\
1192     uint8_t halfHV[256];\
1193     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1194     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1196 }\
1197 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1198 {\
1199     uint8_t halfH[272];\
1200     uint8_t halfHV[256];\
1201     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1202     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1204 }\
1205 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1206 {\
1207     uint8_t full[24*17];\
1208     uint8_t halfH[272];\
1209     uint8_t halfV[256];\
1210     uint8_t halfHV[256];\
1211     copy_block17(full, src, 24, stride, 17);\
1212     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1216 }\
1217 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1218 {\
1219     uint8_t full[24*17];\
1220     uint8_t halfH[272];\
1221     copy_block17(full, src, 24, stride, 17);\
1222     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1223     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1224     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1225 }\
1226 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1227 {\
1228     uint8_t full[24*17];\
1229     uint8_t halfH[272];\
1230     uint8_t halfV[256];\
1231     uint8_t halfHV[256];\
1232     copy_block17(full, src, 24, stride, 17);\
1233     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1234     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1235     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1236     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1237 }\
1238 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1239 {\
1240     uint8_t full[24*17];\
1241     uint8_t halfH[272];\
1242     copy_block17(full, src, 24, stride, 17);\
1243     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1244     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1245     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1246 }\
1247 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1248 {\
1249     uint8_t halfH[272];\
1250     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1252 }
1253
1254 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1255 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1256 #define op_put(a, b) a = cm[((b) + 16)>>5]
1257 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1258
1259 QPEL_MC(0, put_       , _       , op_put)
1260 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1261 QPEL_MC(0, avg_       , _       , op_avg)
1262 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1263 #undef op_avg
1264 #undef op_avg_no_rnd
1265 #undef op_put
1266 #undef op_put_no_rnd
1267
/**
 * Three-argument wrapper around put_pixels8_8_c() with its last argument
 * fixed at 8.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 8;

    put_pixels8_8_c(dst, src, stride, h);
}
/**
 * Three-argument wrapper around avg_pixels8_8_c() with its last argument
 * fixed at 8.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 8;

    avg_pixels8_8_c(dst, src, stride, h);
}
/**
 * Three-argument wrapper around put_pixels16_8_c() with its last argument
 * fixed at 16.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 16;

    put_pixels16_8_c(dst, src, stride, h);
}
/**
 * Three-argument wrapper around avg_pixels16_8_c() with its last argument
 * fixed at 16.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 16;

    avg_pixels16_8_c(dst, src, stride, h);
}
1284
/*
 * The mc00 qpel names are aliases for the fixed-size pixel wrappers.
 * The put_no_rnd names resolve to the same functions as the plain put names.
 */
/* 8x8 */
#define put_qpel8_mc00_c          ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c   ff_put_pixels8x8_c
#define avg_qpel8_mc00_c          ff_avg_pixels8x8_c
/* 16x16 */
#define put_qpel16_mc00_c         ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c  ff_put_pixels16x16_c
#define avg_qpel16_mc00_c         ff_avg_pixels16x16_c
1291
1292 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1293     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1294     int i;
1295
1296     for(i=0; i<h; i++){
1297         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1298         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1299         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1300         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1301         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1302         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1303         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1304         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1305         dst+=dstStride;
1306         src+=srcStride;
1307     }
1308 }
1309
1310 #if CONFIG_RV40_DECODER
/**
 * RV40 mc33 entry for 16-wide blocks: forwards to put_pixels16_xy2_8_c()
 * with its last argument fixed at 16.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 16;

    put_pixels16_xy2_8_c(dst, src, stride, h);
}
/**
 * RV40 mc33 entry for 16-wide blocks: forwards to avg_pixels16_xy2_8_c()
 * with its last argument fixed at 16.
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, h);
}
/**
 * RV40 mc33 entry for 8-wide blocks: forwards to put_pixels8_xy2_8_c()
 * with its last argument fixed at 8.
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 8;

    put_pixels8_xy2_8_c(dst, src, stride, h);
}
/**
 * RV40 mc33 entry for 8-wide blocks: forwards to avg_pixels8_xy2_8_c()
 * with its last argument fixed at 8.
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int h = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, h);
}
1327 #endif /* CONFIG_RV40_DECODER */
1328
1329 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1330     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1331     int i;
1332
1333     for(i=0; i<w; i++){
1334         const int src_1= src[ -srcStride];
1335         const int src0 = src[0          ];
1336         const int src1 = src[  srcStride];
1337         const int src2 = src[2*srcStride];
1338         const int src3 = src[3*srcStride];
1339         const int src4 = src[4*srcStride];
1340         const int src5 = src[5*srcStride];
1341         const int src6 = src[6*srcStride];
1342         const int src7 = src[7*srcStride];
1343         const int src8 = src[8*srcStride];
1344         const int src9 = src[9*srcStride];
1345         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1346         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1347         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1348         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1349         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1350         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1351         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1352         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1353         src++;
1354         dst++;
1355     }
1356 }
1357
/**
 * Filters an 8x8 block horizontally into a temporary buffer, then hands src
 * and that buffer to put_pixels8_l2_8() to produce dst.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1364
/**
 * Writes the horizontal lowpass of src straight into dst: 8 rows, with the
 * same stride used for input and output.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1369
/**
 * Same as the mc10 variant except that the unfiltered input handed to
 * put_pixels8_l2_8() starts one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1376
/**
 * Writes the vertical lowpass of src straight into dst: 8 columns, with the
 * same stride used for input and output.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
1381
/**
 * Builds two intermediate 8x8 blocks and passes both to put_pixels8_l2_8():
 *  - the vertical lowpass of src, and
 *  - the vertical lowpass of the horizontally filtered input.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads one row above and two rows below its 8 output rows.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* Skip the extra top row so the vertical filter can reach it at offset -1. */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * Same as the mc12 variant except that the vertical-only block is filtered
 * from src + 1 (one pixel to the right).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    /* Skip the extra top row so the vertical filter can reach it at offset -1. */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * Horizontal lowpass into a temporary 8x11 buffer (starting one row above
 * src), followed by a vertical lowpass of that buffer written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    /* Start at the second buffered row; the vertical filter reads offset -1. */
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1408
1409 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1410     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1411     int x;
1412     const int strength= ff_h263_loop_filter_strength[qscale];
1413
1414     for(x=0; x<8; x++){
1415         int d1, d2, ad1;
1416         int p0= src[x-2*stride];
1417         int p1= src[x-1*stride];
1418         int p2= src[x+0*stride];
1419         int p3= src[x+1*stride];
1420         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1421
1422         if     (d<-2*strength) d1= 0;
1423         else if(d<-  strength) d1=-2*strength - d;
1424         else if(d<   strength) d1= d;
1425         else if(d< 2*strength) d1= 2*strength - d;
1426         else                   d1= 0;
1427
1428         p1 += d1;
1429         p2 -= d1;
1430         if(p1&256) p1= ~(p1>>31);
1431         if(p2&256) p2= ~(p2>>31);
1432
1433         src[x-1*stride] = p1;
1434         src[x+0*stride] = p2;
1435
1436         ad1= FFABS(d1)>>1;
1437
1438         d2= av_clip((p0-p3)/4, -ad1, ad1);
1439
1440         src[x-2*stride] = p0 - d2;
1441         src[x+  stride] = p3 + d2;
1442     }
1443     }
1444 }
1445
1446 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1447     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1448     int y;
1449     const int strength= ff_h263_loop_filter_strength[qscale];
1450
1451     for(y=0; y<8; y++){
1452         int d1, d2, ad1;
1453         int p0= src[y*stride-2];
1454         int p1= src[y*stride-1];
1455         int p2= src[y*stride+0];
1456         int p3= src[y*stride+1];
1457         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1458
1459         if     (d<-2*strength) d1= 0;
1460         else if(d<-  strength) d1=-2*strength - d;
1461         else if(d<   strength) d1= d;
1462         else if(d< 2*strength) d1= 2*strength - d;
1463         else                   d1= 0;
1464
1465         p1 += d1;
1466         p2 -= d1;
1467         if(p1&256) p1= ~(p1>>31);
1468         if(p2&256) p2= ~(p2>>31);
1469
1470         src[y*stride-1] = p1;
1471         src[y*stride+0] = p2;
1472
1473         ad1= FFABS(d1)>>1;
1474
1475         d2= av_clip((p0-p3)/4, -ad1, ad1);
1476
1477         src[y*stride-2] = p0 - d2;
1478         src[y*stride+1] = p3 + d2;
1479     }
1480     }
1481 }
1482
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return sum over all rows and 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1510
/**
 * Sum of absolute differences between a 16-wide block and a reference that is
 * first combined horizontally: each pix1[x] is compared against
 * avg2(pix2[x], pix2[x + 1]), so 17 reference pixels are read per row.
 * avg2() is defined elsewhere in this file.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1538
/**
 * Sum of absolute differences between a 16-wide block and a reference that is
 * first combined vertically: each pix1[x] is compared against avg2() of the
 * reference pixel and the one a row below it, so h + 1 reference rows are read.
 * avg2() is defined elsewhere in this file.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1568
/**
 * Sum of absolute differences between a 16-wide block and a reference that is
 * first combined in both directions: each pix1[x] is compared against avg4()
 * of the 2x2 reference neighbourhood whose top-left pixel is pix2[x].
 * This reads 17 pixels from each of h + 1 reference rows.
 * avg4() is defined elsewhere in this file.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1598
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - ref[col]);
        cur += line_size;
        ref += line_size;
    }
    return sad;
}
1618
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against pix2 combined with its right-hand neighbour.
 *
 * Each reference sample is avg2() of pix2[x] and pix2[x+1] (avg2 is defined
 * elsewhere in this file).
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; 9 columns are read per row
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(ref[col], ref[col + 1]));
        cur += line_size;
        ref += line_size;
    }
    return sad;
}
1638
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against pix2 combined with the row directly below it.
 *
 * Each reference sample is avg2() of pix2[x] and the sample one line_size
 * further on (avg2 is defined elsewhere in this file).
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; h+1 rows are read
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1660
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against pix2 combined over a 2x2 neighbourhood.
 *
 * Each reference sample is avg4() of pix2[x], pix2[x+1] and the two samples
 * one line_size below them (avg4 is defined elsewhere in this file).
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; 9 columns and h+1 rows are read
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to process
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg4(upper[col], upper[col + 1], lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1682
1683 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1684     MpegEncContext *c = v;
1685     int score1=0;
1686     int score2=0;
1687     int x,y;
1688
1689     for(y=0; y<h; y++){
1690         for(x=0; x<16; x++){
1691             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1692         }
1693         if(y+1<h){
1694             for(x=0; x<15; x++){
1695                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1696                              - s1[x+1] + s1[x+1+stride])
1697                         -FFABS(  s2[x  ] - s2[x  +stride]
1698                              - s2[x+1] + s2[x+1+stride]);
1699             }
1700         }
1701         s1+= stride;
1702         s2+= stride;
1703     }
1704
1705     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1706     else  return score1 + FFABS(score2)*8;
1707 }
1708
1709 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1710     MpegEncContext *c = v;
1711     int score1=0;
1712     int score2=0;
1713     int x,y;
1714
1715     for(y=0; y<h; y++){
1716         for(x=0; x<8; x++){
1717             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1718         }
1719         if(y+1<h){
1720             for(x=0; x<7; x++){
1721                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1722                              - s1[x+1] + s1[x+1+stride])
1723                         -FFABS(  s2[x  ] - s2[x  +stride]
1724                              - s2[x+1] + s2[x+1+stride]);
1725             }
1726         }
1727         s1+= stride;
1728         s2+= stride;
1729     }
1730
1731     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1732     else  return score1 + FFABS(score2)*8;
1733 }
1734
1735 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1736     int i;
1737     unsigned int sum=0;
1738
1739     for(i=0; i<8*8; i++){
1740         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1741         int w= weight[i];
1742         b>>= RECON_SHIFT;
1743         assert(-512<b && b<512);
1744
1745         sum += (w*b)*(w*b)>>4;
1746     }
1747     return sum>>2;
1748 }
1749
1750 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1751     int i;
1752
1753     for(i=0; i<8*8; i++){
1754         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1755     }
1756 }
1757
/**
 * Comparison function that ignores its arguments and always scores 0.
 * Installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1761
1762 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1763     int i;
1764
1765     memset(cmp, 0, sizeof(void*)*6);
1766
1767     for(i=0; i<6; i++){
1768         switch(type&0xFF){
1769         case FF_CMP_SAD:
1770             cmp[i]= c->sad[i];
1771             break;
1772         case FF_CMP_SATD:
1773             cmp[i]= c->hadamard8_diff[i];
1774             break;
1775         case FF_CMP_SSE:
1776             cmp[i]= c->sse[i];
1777             break;
1778         case FF_CMP_DCT:
1779             cmp[i]= c->dct_sad[i];
1780             break;
1781         case FF_CMP_DCT264:
1782             cmp[i]= c->dct264_sad[i];
1783             break;
1784         case FF_CMP_DCTMAX:
1785             cmp[i]= c->dct_max[i];
1786             break;
1787         case FF_CMP_PSNR:
1788             cmp[i]= c->quant_psnr[i];
1789             break;
1790         case FF_CMP_BIT:
1791             cmp[i]= c->bit[i];
1792             break;
1793         case FF_CMP_RD:
1794             cmp[i]= c->rd[i];
1795             break;
1796         case FF_CMP_VSAD:
1797             cmp[i]= c->vsad[i];
1798             break;
1799         case FF_CMP_VSSE:
1800             cmp[i]= c->vsse[i];
1801             break;
1802         case FF_CMP_ZERO:
1803             cmp[i]= zero_cmp;
1804             break;
1805         case FF_CMP_NSSE:
1806             cmp[i]= c->nsse[i];
1807             break;
1808         default:
1809             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1810         }
1811     }
1812 }
1813
1814 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1815     long i;
1816     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1817         long a = *(long*)(src+i);
1818         long b = *(long*)(dst+i);
1819         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1820     }
1821     for(; i<w; i++)
1822         dst[i+0] += src[i+0];
1823 }
1824
1825 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1826     long i;
1827 #if !HAVE_FAST_UNALIGNED
1828     if((long)src2 & (sizeof(long)-1)){
1829         for(i=0; i+7<w; i+=8){
1830             dst[i+0] = src1[i+0]-src2[i+0];
1831             dst[i+1] = src1[i+1]-src2[i+1];
1832             dst[i+2] = src1[i+2]-src2[i+2];
1833             dst[i+3] = src1[i+3]-src2[i+3];
1834             dst[i+4] = src1[i+4]-src2[i+4];
1835             dst[i+5] = src1[i+5]-src2[i+5];
1836             dst[i+6] = src1[i+6]-src2[i+6];
1837             dst[i+7] = src1[i+7]-src2[i+7];
1838         }
1839     }else
1840 #endif
1841     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1842         long a = *(long*)(src1+i);
1843         long b = *(long*)(src2+i);
1844         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1845     }
1846     for(; i<w; i++)
1847         dst[i+0] = src1[i+0]-src2[i+0];
1848 }
1849
/**
 * Reconstruct a row of samples from median-predicted differences.
 *
 * For every position the predictor is mid_pred() of the left sample, the
 * sample above (src1) and (left + above - above-left) masked to 8 bits; the
 * stored difference is added to it and the result wraps to 8 bits.
 *
 * @param dst      reconstructed row, w bytes
 * @param src1     row above, w bytes
 * @param diff     stored differences, w bytes
 * @param w        number of samples
 * @param left     in: sample left of the first position; out: last
 *                 reconstructed sample
 * @param left_top in: sample above-left of the first position; out: last
 *                 sample read from src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top      = src1[x];
        const int gradient = (cur_left + top - cur_top_left) & 0xFF;

        cur_left     = mid_pred(cur_left, top, gradient) + diff[x];
        cur_top_left = top;
        dst[x]       = cur_left;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
1866
/**
 * Compute median-prediction differences for a row of samples.
 *
 * For every position the predictor is mid_pred() of the left sample, the
 * sample above (src1) and (left + above - above-left) masked to 8 bits; the
 * value written is the actual sample (src2) minus that predictor, wrapped to
 * 8 bits.
 *
 * @param dst      output differences, w bytes
 * @param src1     row above, w bytes
 * @param src2     current row, w bytes
 * @param w        number of samples
 * @param left     in: sample left of the first position; out: last sample
 *                 read from src2
 * @param left_top in: sample above-left of the first position; out: last
 *                 sample read from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top       = src1[x];
        const int gradient  = (cur_left + top - cur_top_left) & 0xFF;
        const int predicted = mid_pred(cur_left, top, gradient);

        cur_top_left = top;
        cur_left     = src2[x];
        dst[x]       = cur_left - predicted;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
1884
/**
 * Reconstruct a row from left-predicted differences (running sum).
 *
 * The accumulator is kept as a full int; only its low 8 bits are stored in
 * dst.
 *
 * @param dst output row, w bytes
 * @param src stored differences, w bytes
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1903
/* Byte offsets of the four channels inside one 32-bit pixel; they are
 * reversed on big-endian hosts. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Reconstruct a row of 4-byte pixels from left-predicted differences, with
 * one running sum per channel.
 *
 * The accumulators are kept as full ints; only their low 8 bits are stored.
 *
 * @param dst   output pixels, 4*w bytes
 * @param src   stored differences, 4*w bytes
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator; out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t *in  = src + 4 * x;
        uint8_t       *out = dst + 4 * x;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1944
/* o1 = i1 + i2, o2 = i1 - i2; inputs and outputs are distinct expressions.
 * Expands to two statements, the second already terminated by ';'. */
#define BUTTERFLY2(o1, o2, i1, i2) \
    (o1) = (i1) + (i2);            \
    (o2) = (i1) - (i2);

/* In-place butterfly: x becomes x + y and y becomes x - y, both computed
 * from the values held on entry. */
#define BUTTERFLY1(x, y)    \
{                           \
    int bf_lhs, bf_rhs;     \
    bf_lhs = (x);           \
    bf_rhs = (y);           \
    (x) = bf_lhs + bf_rhs;  \
    (y) = bf_lhs - bf_rhs;  \
}

/* Final butterfly stage folded into the absolute sum: |x + y| + |x - y|. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1959
/**
 * Sum of absolute butterfly-transformed differences of an 8x8 block.
 *
 * The difference src - dst is run through three butterfly stages along each
 * row, then three along each column; the last column stage is fused with
 * the absolute-value accumulation.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return sum of absolute transformed coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        int *row               = temp + 8 * i;
        const uint8_t *src_row = src + stride * i;
        const uint8_t *dst_row = dst + stride * i;

        BUTTERFLY2(row[0], row[1], src_row[0] - dst_row[0], src_row[1] - dst_row[1]);
        BUTTERFLY2(row[2], row[3], src_row[2] - dst_row[2], src_row[3] - dst_row[3]);
        BUTTERFLY2(row[4], row[5], src_row[4] - dst_row[4], src_row[5] - dst_row[5]);
        BUTTERFLY2(row[6], row[7], src_row[6] - dst_row[6], src_row[7] - dst_row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4])
             + BUTTERFLYA(col[8 * 1], col[8 * 5])
             + BUTTERFLYA(col[8 * 2], col[8 * 6])
             + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }
    return sum;
}
2004
/**
 * Sum of absolute butterfly-transformed samples of a single 8x8 block, with
 * the mean term removed.
 *
 * The samples are run through three butterfly stages along each row, then
 * three along each column; the last column stage is fused with the
 * absolute-value accumulation. Finally |temp[0] + temp[32]| is subtracted,
 * which the original comment labels "-mean".
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return sum of absolute transformed coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        int *row               = temp + 8 * i;
        const uint8_t *src_row = src + stride * i;

        BUTTERFLY2(row[0], row[1], src_row[0], src_row[1]);
        BUTTERFLY2(row[2], row[3], src_row[2], src_row[3]);
        BUTTERFLY2(row[4], row[5], src_row[4], src_row[5]);
        BUTTERFLY2(row[6], row[7], src_row[6], src_row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4])
             + BUTTERFLYA(col[8 * 1], col[8 * 5])
             + BUTTERFLYA(col[8 * 2], col[8 * 6])
             + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
2052
2053 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2054     MpegEncContext * const s= (MpegEncContext *)c;
2055     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2056
2057     assert(h==8);
2058
2059     s->dsp.diff_pixels(temp, src1, src2, stride);
2060     s->dsp.fdct(temp);
2061     return s->dsp.sum_abs_dctelem(temp);
2062 }
2063
#if CONFIG_GPL
/*
 * One 8-point 1-D integer transform built only from additions, subtractions
 * and shifts. The macro is generic over its data access: the user must
 * define SRC(x) to read input sample x and DST(x, v) to consume output x
 * with value v before expanding it. It relies on no other outside name.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute integer-transform coefficients of the difference of two
 * 8x8 blocks, using the DCT8_1D transform above.
 *
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused (not checked here, unlike the sibling 8x8 metrics)
 * @return sum of the absolute values of all 64 transformed coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* dct = src1 - src2 */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform every row in place. */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* Second pass: transform every column; the outputs are not stored,
     * their absolute values are accumulated directly. */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2116
2117 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2118     MpegEncContext * const s= (MpegEncContext *)c;
2119     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2120     int sum=0, i;
2121
2122     assert(h==8);
2123
2124     s->dsp.diff_pixels(temp, src1, src2, stride);
2125     s->dsp.fdct(temp);
2126
2127     for(i=0; i<64; i++)
2128         sum= FFMAX(sum, FFABS(temp[i]));
2129
2130     return sum;
2131 }
2132
2133 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2134     MpegEncContext * const s= (MpegEncContext *)c;
2135     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2136     int16_t * const bak = temp+64;
2137     int sum=0, i;
2138
2139     assert(h==8);
2140     s->mb_intra=0;
2141
2142     s->dsp.diff_pixels(temp, src1, src2, stride);
2143
2144     memcpy(bak, temp, 64*sizeof(int16_t));
2145
2146     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2147     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2148     ff_simple_idct_8(temp); //FIXME
2149
2150     for(i=0; i<64; i++)
2151         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2152
2153     return sum;
2154 }
2155
2156 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2157     MpegEncContext * const s= (MpegEncContext *)c;
2158     const uint8_t *scantable= s->intra_scantable.permutated;
2159     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2160     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2161     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2162     int i, last, run, bits, level, distortion, start_i;
2163     const int esc_length= s->ac_esc_length;
2164     uint8_t * length;
2165     uint8_t * last_length;
2166
2167     assert(h==8);
2168
2169     copy_block8(lsrc1, src1, 8, stride, 8);
2170     copy_block8(lsrc2, src2, 8, stride, 8);
2171
2172     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2173
2174     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2175
2176     bits=0;
2177
2178     if (s->mb_intra) {
2179         start_i = 1;
2180         length     = s->intra_ac_vlc_length;
2181         last_length= s->intra_ac_vlc_last_length;
2182         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2183     } else {
2184         start_i = 0;
2185         length     = s->inter_ac_vlc_length;
2186         last_length= s->inter_ac_vlc_last_length;
2187     }
2188
2189     if(last>=start_i){
2190         run=0;
2191         for(i=start_i; i<last; i++){
2192             int j= scantable[i];
2193             level= temp[j];
2194
2195             if(level){
2196                 level+=64;
2197                 if((level&(~127)) == 0){
2198                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2199                 }else
2200                     bits+= esc_length;
2201                 run=0;
2202             }else
2203                 run++;
2204         }
2205         i= scantable[last];
2206
2207         level= temp[i] + 64;
2208
2209         assert(level - 64);
2210
2211         if((level&(~127)) == 0){
2212             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2213         }else
2214             bits+= esc_length;
2215
2216     }
2217
2218     if(last>=0){
2219         if(s->mb_intra)
2220             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2221         else
2222             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2223     }
2224
2225     s->dsp.idct_add(lsrc2, 8, temp);
2226
2227     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2228
2229     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2230 }
2231
2232 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2233     MpegEncContext * const s= (MpegEncContext *)c;
2234     const uint8_t *scantable= s->intra_scantable.permutated;
2235     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2236     int i, last, run, bits, level, start_i;
2237     const int esc_length= s->ac_esc_length;
2238     uint8_t * length;
2239     uint8_t * last_length;
2240
2241     assert(h==8);
2242
2243     s->dsp.diff_pixels(temp, src1, src2, stride);
2244
2245     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2246
2247     bits=0;
2248
2249     if (s->mb_intra) {
2250         start_i = 1;
2251         length     = s->intra_ac_vlc_length;
2252         last_length= s->intra_ac_vlc_last_length;
2253         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2254     } else {
2255         start_i = 0;
2256         length     = s->inter_ac_vlc_length;
2257         last_length= s->inter_ac_vlc_last_length;
2258     }
2259
2260     if(last>=start_i){
2261         run=0;
2262         for(i=start_i; i<last; i++){
2263             int j= scantable[i];
2264             level= temp[j];
2265
2266             if(level){
2267                 level+=64;
2268                 if((level&(~127)) == 0){
2269                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2270                 }else
2271                     bits+= esc_length;
2272                 run=0;
2273             }else
2274                 run++;
2275         }
2276         i= scantable[last];
2277
2278         level= temp[i] + 64;
2279
2280         assert(level - 64);
2281
2282         if((level&(~127)) == 0){
2283             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2284         }else
2285             bits+= esc_length;
2286     }
2287
2288     return bits;
2289 }
2290
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * each row of a block and the row below it, over rows 0..h-2 and <size>
 * columns. The context pointer and the second block pointer are unused.
 */
#define VSAD_INTRA(size)                                                                                    \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)  \
{                                                                                                           \
    int score = 0;                                                                                          \
    int x, y;                                                                                               \
                                                                                                            \
    for (y = 1; y < h; y++) {                                                                               \
        const uint8_t *below = s + stride;                                                                  \
        for (x = 0; x < size; x++)                                                                          \
            score += FFABS(s[x] - below[x]);                                                                \
        s = (uint8_t *)below;                                                                               \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2308
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks: for every
 * pair of adjacent rows, the absolute change of (s1 - s2) from one row to
 * the next is accumulated.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; h-1 row pairs are evaluated
 * @return accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *next1 = s1 + stride;
        const uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++)
            score += FFABS(s1[col] - s2[col] - next1[col] + next2[col]);

        s1 += stride;
        s2 += stride;
    }

    return score;
}
2323
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * each row of a block and the row below it, over rows 0..h-2 and <size>
 * columns. The context pointer and the second block pointer are unused.
 */
#define VSSE_INTRA(size)                                                                                    \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)  \
{                                                                                                           \
    int score = 0;                                                                                          \
    int x, y;                                                                                               \
                                                                                                            \
    for (y = 1; y < h; y++) {                                                                               \
        const uint8_t *below = s + stride;                                                                  \
        for (x = 0; x < size; x++)                                                                          \
            score += SQ(s[x] - below[x]);                                                                   \
        s = (uint8_t *)below;                                                                               \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2342
/**
 * Vertical SSE of the difference of two 16-pixel-wide blocks: for every
 * pair of adjacent rows, the squared change of (s1 - s2) from one row to
 * the next is accumulated.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; h-1 row pairs are evaluated
 * @return accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *next1 = s1 + stride;
        const uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++)
            score += SQ(s1[col] - s2[col] - next1[col] + next2[col]);

        s1 += stride;
        s2 += stride;
    }

    return score;
}
2357
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int diff = pix1[k] - pix2[k];
        total += diff * diff;
    }
    return total;
}
2366
/*
 * Generates a 16-pixel-wide comparison function name16() from an 8x8 one,
 * name8(). The top-left and top-right 8x8 quadrants are always scored; the
 * two bottom quadrants are added only when h == 16. The calls are kept as
 * separate statements so the quadrants are always visited in the same order.
 */
#define WRAPPER8_16_SQ(name8, name16)                                                           \
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h)    \
{                                                                                               \
    int score = 0;                                                                              \
                                                                                                \
    score += name8(s, dst,     src,     stride, 8);                                             \
    score += name8(s, dst + 8, src + 8, stride, 8);                                             \
    if (h != 16)                                                                                \
        return score;                                                                           \
                                                                                                \
    dst += 8 * stride;                                                                          \
    src += 8 * stride;                                                                          \
    score += name8(s, dst,     src,     stride, 8);                                             \
    score += name8(s, dst + 8, src + 8, stride, 8);                                             \
    return score;                                                                               \
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2391
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a compares greater than it as an unsigned
 *                 integer
 * @param maxi     returned when a, with its top bit flipped, compares
 *                 greater than maxisign
 * @param maxisign maxi with its top bit flipped, as computed by the caller
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2400
/**
 * Clip an array of floats to [*min, *max] by operating on their bit
 * patterns with clipf_c_one(). The only caller in view uses it when min is
 * negative and max is positive.
 *
 * Elements are processed in groups of 8, so a whole group is read and
 * written even when len is not a multiple of 8.
 *
 * @param dst output array
 * @param src input array
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len)
{
    const uint32_t mini     = *(uint32_t*)min;
    const uint32_t maxi     = *(uint32_t*)max;
    const uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti          = (uint32_t*)dst;
    const uint32_t *srci    = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dsti[base + k] = clipf_c_one(srci[base + k], mini, maxi, maxisign);
    }
}
/**
 * Clip an array of floats to [min, max].
 *
 * When the bounds have opposite signs the integer-based
 * vector_clipf_c_opposite_sign() is used; otherwise each element goes
 * through av_clipf(). Elements are processed in groups of 8 on both paths.
 *
 * @param dst output array
 * @param src input array
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2436
/**
 * Dot product of two int16_t vectors.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[k] * v2[k], accumulated in an int
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;

    for (; order; order--) {
        acc += *v1 * *v2;
        v1++;
        v2++;
    }

    return acc;
}
2446
/**
 * Dot product of v1 and v2, combined with an in-place multiply-add
 * v1[k] += mul * v3[k].
 *
 * The dot product uses the value of v1[k] from before it is updated.
 *
 * @param v1    first vector; updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled and added to v1
 * @param order number of elements
 * @param mul   scale applied to v3
 * @return sum of (old v1[k]) * v2[k], accumulated in an int
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order; order--) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
        v1++;
        v2++;
        v3++;
    }
    return acc;
}
2456
/**
 * Apply a symmetric window to an int16_t signal.
 *
 * Only the first len/2 window coefficients are read; coefficient k is
 * applied to both sample k and sample len-1-k. Each product is rounded by
 * adding 1 << 14 and shifted right by 15.
 *
 * @param output windowed signal, len elements
 * @param input  input signal, len elements
 * @param window first half of the window, len/2 elements
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    unsigned int tail = len - 1;
    int head;

    for (head = 0; head < half; head++, tail--) {
        const int16_t w = window[head];

        output[head] = (MUL16(input[head], w) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], w) + (1 << 14)) >> 15;
    }
}
2469
/**
 * Clip each element of src to [min, max] and store the result in dst.
 *
 * Whole groups of eight elements are processed unrolled; any remaining
 * elements are handled one at a time. A len of zero therefore does nothing,
 * and a len that is not a multiple of eight no longer makes the unsigned
 * counter wrap around and run past the end of the buffers.
 *
 * @param dst destination array
 * @param src source array
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements to process
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int groups = len >> 3;
    unsigned int tail   = len & 7;

    for (; groups > 0; groups--) {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
    }

    /* leftover elements when len is not a multiple of eight */
    for (; tail > 0; tail--)
        *dst++ = av_clip(*src++, min, max);
}
2485
/**
 * idct_put entry used when FF_IDCT_INT is selected: run ff_j_rev_dct() on
 * the coefficient block, then pass the block to put_pixels_clamped_c()
 * together with the destination and its line size.
 *
 * @param dest      destination pixels
 * @param line_size value forwarded to put_pixels_clamped_c() as line size
 * @param block     coefficient block, handed to ff_j_rev_dct() first
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * idct_add entry used when FF_IDCT_INT is selected: run ff_j_rev_dct() on
 * the coefficient block, then pass the block to add_pixels_clamped_c()
 * together with the destination and its line size.
 *
 * @param dest      destination pixels
 * @param line_size value forwarded to add_pixels_clamped_c() as line size
 * @param block     coefficient block, handed to ff_j_rev_dct() first
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2496
2497 /* init static data */
2498 av_cold void ff_dsputil_static_init(void)
2499 {
2500     int i;
2501
2502     for(i=0;i<512;i++) {
2503         ff_squareTbl[i] = (i - 256) * (i - 256);
2504     }
2505
2506     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2507 }
2508
2509 int ff_check_alignment(void){
2510     static int did_fail=0;
2511     LOCAL_ALIGNED_16(int, aligned, [4]);
2512
2513     if((intptr_t)aligned & 15){
2514         if(!did_fail){
2515 #if HAVE_MMX || HAVE_ALTIVEC
2516             av_log(NULL, AV_LOG_ERROR,
2517                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2518                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2519                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2520                 "Do not report crashes to Libav developers.\n");
2521 #endif
2522             did_fail=1;
2523         }
2524         return -1;
2525     }
2526     return 0;
2527 }
2528
2529 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2530 {
2531     ff_check_alignment();
2532
2533 #if CONFIG_ENCODERS
2534     if (avctx->bits_per_raw_sample == 10) {
2535         c->fdct    = ff_jpeg_fdct_islow_10;
2536         c->fdct248 = ff_fdct248_islow_10;
2537     } else {
2538         if(avctx->dct_algo==FF_DCT_FASTINT) {
2539             c->fdct    = ff_fdct_ifast;
2540             c->fdct248 = ff_fdct_ifast248;
2541         }
2542         else if(avctx->dct_algo==FF_DCT_FAAN) {
2543             c->fdct    = ff_faandct;
2544             c->fdct248 = ff_faandct248;
2545         }
2546         else {
2547             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2548             c->fdct248 = ff_fdct248_islow_8;
2549         }
2550     }
2551 #endif //CONFIG_ENCODERS
2552
2553     if (avctx->bits_per_raw_sample == 10) {
2554         c->idct_put              = ff_simple_idct_put_10;
2555         c->idct_add              = ff_simple_idct_add_10;
2556         c->idct                  = ff_simple_idct_10;
2557         c->idct_permutation_type = FF_NO_IDCT_PERM;
2558     } else {
2559         if(avctx->idct_algo==FF_IDCT_INT){
2560             c->idct_put= jref_idct_put;
2561             c->idct_add= jref_idct_add;
2562             c->idct    = ff_j_rev_dct;
2563             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2564         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2565             c->idct_put= ff_faanidct_put;
2566             c->idct_add= ff_faanidct_add;
2567             c->idct    = ff_faanidct;
2568             c->idct_permutation_type= FF_NO_IDCT_PERM;
2569         }else{ //accurate/default
2570             c->idct_put = ff_simple_idct_put_8;
2571             c->idct_add = ff_simple_idct_add_8;
2572             c->idct     = ff_simple_idct_8;
2573             c->idct_permutation_type= FF_NO_IDCT_PERM;
2574         }
2575     }
2576
2577     c->diff_pixels = diff_pixels_c;
2578     c->put_pixels_clamped = put_pixels_clamped_c;
2579     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2580     c->add_pixels_clamped = add_pixels_clamped_c;
2581     c->sum_abs_dctelem = sum_abs_dctelem_c;
2582     c->gmc1 = gmc1_c;
2583     c->gmc = ff_gmc_c;
2584     c->pix_sum = pix_sum_c;
2585     c->pix_norm1 = pix_norm1_c;
2586
2587     c->fill_block_tab[0] = fill_block16_c;
2588     c->fill_block_tab[1] = fill_block8_c;
2589
2590     /* TODO [0] 16  [1] 8 */
2591     c->pix_abs[0][0] = pix_abs16_c;
2592     c->pix_abs[0][1] = pix_abs16_x2_c;
2593     c->pix_abs[0][2] = pix_abs16_y2_c;
2594     c->pix_abs[0][3] = pix_abs16_xy2_c;
2595     c->pix_abs[1][0] = pix_abs8_c;
2596     c->pix_abs[1][1] = pix_abs8_x2_c;
2597     c->pix_abs[1][2] = pix_abs8_y2_c;
2598     c->pix_abs[1][3] = pix_abs8_xy2_c;
2599
2600     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2601     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2602     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2603     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2604     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2605     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2606     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2607     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2608     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2609
2610     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2611     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2612     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2613     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2614     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2615     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2616     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2617     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2618     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2619
2620 #define dspfunc(PFX, IDX, NUM) \
2621     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2622     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2623     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2624     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2625     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2626     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2627     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2628     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2629     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2630     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2631     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2632     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2633     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2634     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2635     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2636     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2637
2638     dspfunc(put_qpel, 0, 16);
2639     dspfunc(put_no_rnd_qpel, 0, 16);
2640
2641     dspfunc(avg_qpel, 0, 16);
2642     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2643
2644     dspfunc(put_qpel, 1, 8);
2645     dspfunc(put_no_rnd_qpel, 1, 8);
2646
2647     dspfunc(avg_qpel, 1, 8);
2648     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2649
2650 #undef dspfunc
2651
2652     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2653     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2654     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2655     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2656     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2657     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2658     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2659     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2660
2661 #define SET_CMP_FUNC(name) \
2662     c->name[0]= name ## 16_c;\
2663     c->name[1]= name ## 8x8_c;
2664
2665     SET_CMP_FUNC(hadamard8_diff)
2666     c->hadamard8_diff[4]= hadamard8_intra16_c;
2667     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2668     SET_CMP_FUNC(dct_sad)
2669     SET_CMP_FUNC(dct_max)
2670 #if CONFIG_GPL
2671     SET_CMP_FUNC(dct264_sad)
2672 #endif
2673     c->sad[0]= pix_abs16_c;
2674     c->sad[1]= pix_abs8_c;
2675     c->sse[0]= sse16_c;
2676     c->sse[1]= sse8_c;
2677     c->sse[2]= sse4_c;
2678     SET_CMP_FUNC(quant_psnr)
2679     SET_CMP_FUNC(rd)
2680     SET_CMP_FUNC(bit)
2681     c->vsad[0]= vsad16_c;
2682     c->vsad[4]= vsad_intra16_c;
2683     c->vsad[5]= vsad_intra8_c;
2684     c->vsse[0]= vsse16_c;
2685     c->vsse[4]= vsse_intra16_c;
2686     c->vsse[5]= vsse_intra8_c;
2687     c->nsse[0]= nsse16_c;
2688     c->nsse[1]= nsse8_c;
2689
2690     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2691
2692     c->add_bytes= add_bytes_c;
2693     c->diff_bytes= diff_bytes_c;
2694     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2695     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2696     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2697     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2698     c->bswap_buf= bswap_buf;
2699     c->bswap16_buf = bswap16_buf;
2700
2701     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2702         c->h263_h_loop_filter= h263_h_loop_filter_c;
2703         c->h263_v_loop_filter= h263_v_loop_filter_c;
2704     }
2705
2706     c->try_8x8basis= try_8x8basis_c;
2707     c->add_8x8basis= add_8x8basis_c;
2708
2709     c->vector_clipf = vector_clipf_c;
2710     c->scalarproduct_int16 = scalarproduct_int16_c;
2711     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2712     c->apply_window_int16 = apply_window_int16_c;
2713     c->vector_clip_int32 = vector_clip_int32_c;
2714
2715     c->shrink[0]= av_image_copy_plane;
2716     c->shrink[1]= ff_shrink22;
2717     c->shrink[2]= ff_shrink44;
2718     c->shrink[3]= ff_shrink88;
2719
2720     c->add_pixels8 = add_pixels8_c;
2721
2722 #undef FUNC
2723 #undef FUNCC
2724 #define FUNC(f, depth) f ## _ ## depth
2725 #define FUNCC(f, depth) f ## _ ## depth ## _c
2726
2727     c->draw_edges                    = FUNCC(draw_edges, 8);
2728     c->clear_block                   = FUNCC(clear_block, 8);
2729     c->clear_blocks                  = FUNCC(clear_blocks, 8);
2730
2731 #define BIT_DEPTH_FUNCS(depth) \
2732     c->get_pixels                    = FUNCC(get_pixels,   depth);
2733
2734     switch (avctx->bits_per_raw_sample) {
2735     case 9:
2736     case 10:
2737         BIT_DEPTH_FUNCS(16);
2738         break;
2739     default:
2740         BIT_DEPTH_FUNCS(8);
2741         break;
2742     }
2743
2744
2745     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2746     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2747     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2748     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2749     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2750     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2751     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2752
2753     ff_init_scantable_permutation(c->idct_permutation,
2754                                   c->idct_permutation_type);
2755 }