git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/attributes.h"
  31 #include "libavutil/imgutils.h"
  32 #include "libavutil/internal.h"
  33 #include "avcodec.h"
  34 #include "copy_block.h"
  35 #include "dct.h"
  36 #include "dsputil.h"
  37 #include "simple_idct.h"
  38 #include "faandct.h"
  39 #include "faanidct.h"
  40 #include "imgconvert.h"
  41 #include "mathops.h"
  42 #include "mpegvideo.h"
  43 #include "config.h"
  44
/* Lookup table that the helpers in this file access through
 * `ff_squareTbl + 256`, indexing it either with a pixel value or with the
 * difference of two pixel values (-255..255).  It is only zero-initialized
 * here; presumably it is filled with squared values by initialization code
 * that is not part of this definition -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
  46
  47 #define BIT_DEPTH 16
  48 #include "dsputil_template.c"
  49 #undef BIT_DEPTH
  50
  51 #define BIT_DEPTH 8
  52 #include "dsputil_template.c"
  53
/* Byte-replication constants: ~0UL/255 is the byte 0x01 repeated across the
 * whole width of an unsigned long, so multiplying it by 0x7f (or 0x80) gives
 * 0x7f7f7f7f, 0x7f7f7f7f7f7f7f7f, ... depending on how wide unsigned long is
 * on the target. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  57
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   The table has 64 entries and contains every value 0..63 exactly once. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  70
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
 * Declared here with 16-byte alignment and no initializer; the contents are
 * presumably written by init code outside this definition -- TODO confirm. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  73
/* Alternate scan order table: 64 entries holding every value 0..63 exactly
 * once.  Presumably entry n is the raster index (row * 8 + column) of the
 * n-th coefficient visited in an 8x8 block -- confirm against the users. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  84
/* Alternate scan order table with 64 entries.  Presumably entry n is the
 * raster index (row * 8 + column) of the n-th coefficient visited in an 8x8
 * block -- confirm against the users. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  95
/* Input permutation for the simple_idct_mmx.
 * ff_init_scantable_permutation() copies this table verbatim into the
 * caller's idct_permutation when the type is FF_SIMPLE_IDCT_PERM. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 107
/* Replacement for the low three bits (i & 7) of an index; used by
 * ff_init_scantable_permutation() for FF_SSE2_IDCT_PERM, which keeps the
 * bits selected by 0x38 unchanged. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 109
 110 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
 111                                const uint8_t *src_scantable)
 112 {
 113     int i;
 114     int end;
 115
 116     st->scantable= src_scantable;
 117
 118     for(i=0; i<64; i++){
 119         int j;
 120         j = src_scantable[i];
 121         st->permutated[i] = permutation[j];
 122     }
 123
 124     end=-1;
 125     for(i=0; i<64; i++){
 126         int j;
 127         j = st->permutated[i];
 128         if(j>end) end=j;
 129         st->raster_end[i]= end;
 130     }
 131 }
 132
 133 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
 134                                            int idct_permutation_type)
 135 {
 136     int i;
 137
 138     switch(idct_permutation_type){
 139     case FF_NO_IDCT_PERM:
 140         for(i=0; i<64; i++)
 141             idct_permutation[i]= i;
 142         break;
 143     case FF_LIBMPEG2_IDCT_PERM:
 144         for(i=0; i<64; i++)
 145             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 146         break;
 147     case FF_SIMPLE_IDCT_PERM:
 148         for(i=0; i<64; i++)
 149             idct_permutation[i]= simple_mmx_permutation[i];
 150         break;
 151     case FF_TRANSPOSE_IDCT_PERM:
 152         for(i=0; i<64; i++)
 153             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 154         break;
 155     case FF_PARTTRANS_IDCT_PERM:
 156         for(i=0; i<64; i++)
 157             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 158         break;
 159     case FF_SSE2_IDCT_PERM:
 160         for(i=0; i<64; i++)
 161             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 162         break;
 163     default:
 164         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 165     }
 166 }
 167
 168 static int pix_sum_c(uint8_t * pix, int line_size)
 169 {
 170     int s, i, j;
 171
 172     s = 0;
 173     for (i = 0; i < 16; i++) {
 174         for (j = 0; j < 16; j += 8) {
 175             s += pix[0];
 176             s += pix[1];
 177             s += pix[2];
 178             s += pix[3];
 179             s += pix[4];
 180             s += pix[5];
 181             s += pix[6];
 182             s += pix[7];
 183             pix += 8;
 184         }
 185         pix += line_size - 16;
 186     }
 187     return s;
 188 }
 189
 190 static int pix_norm1_c(uint8_t * pix, int line_size)
 191 {
 192     int s, i, j;
 193     uint32_t *sq = ff_squareTbl + 256;
 194
 195     s = 0;
 196     for (i = 0; i < 16; i++) {
 197         for (j = 0; j < 16; j += 8) {
 198 #if 0
 199             s += sq[pix[0]];
 200             s += sq[pix[1]];
 201             s += sq[pix[2]];
 202             s += sq[pix[3]];
 203             s += sq[pix[4]];
 204             s += sq[pix[5]];
 205             s += sq[pix[6]];
 206             s += sq[pix[7]];
 207 #else
 208 #if HAVE_FAST_64BIT
 209             register uint64_t x=*(uint64_t*)pix;
 210             s += sq[x&0xff];
 211             s += sq[(x>>8)&0xff];
 212             s += sq[(x>>16)&0xff];
 213             s += sq[(x>>24)&0xff];
 214             s += sq[(x>>32)&0xff];
 215             s += sq[(x>>40)&0xff];
 216             s += sq[(x>>48)&0xff];
 217             s += sq[(x>>56)&0xff];
 218 #else
 219             register uint32_t x=*(uint32_t*)pix;
 220             s += sq[x&0xff];
 221             s += sq[(x>>8)&0xff];
 222             s += sq[(x>>16)&0xff];
 223             s += sq[(x>>24)&0xff];
 224             x=*(uint32_t*)(pix+4);
 225             s += sq[x&0xff];
 226             s += sq[(x>>8)&0xff];
 227             s += sq[(x>>16)&0xff];
 228             s += sq[(x>>24)&0xff];
 229 #endif
 230 #endif
 231             pix += 8;
 232         }
 233         pix += line_size - 16;
 234     }
 235     return s;
 236 }
 237
 238 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 239     int i;
 240
 241     for(i=0; i+8<=w; i+=8){
 242         dst[i+0]= av_bswap32(src[i+0]);
 243         dst[i+1]= av_bswap32(src[i+1]);
 244         dst[i+2]= av_bswap32(src[i+2]);
 245         dst[i+3]= av_bswap32(src[i+3]);
 246         dst[i+4]= av_bswap32(src[i+4]);
 247         dst[i+5]= av_bswap32(src[i+5]);
 248         dst[i+6]= av_bswap32(src[i+6]);
 249         dst[i+7]= av_bswap32(src[i+7]);
 250     }
 251     for(;i<w; i++){
 252         dst[i+0]= av_bswap32(src[i+0]);
 253     }
 254 }
 255
 256 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 257 {
 258     while (len--)
 259         *dst++ = av_bswap16(*src++);
 260 }
 261
 262 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 263 {
 264     int s, i;
 265     uint32_t *sq = ff_squareTbl + 256;
 266
 267     s = 0;
 268     for (i = 0; i < h; i++) {
 269         s += sq[pix1[0] - pix2[0]];
 270         s += sq[pix1[1] - pix2[1]];
 271         s += sq[pix1[2] - pix2[2]];
 272         s += sq[pix1[3] - pix2[3]];
 273         pix1 += line_size;
 274         pix2 += line_size;
 275     }
 276     return s;
 277 }
 278
 279 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 280 {
 281     int s, i;
 282     uint32_t *sq = ff_squareTbl + 256;
 283
 284     s = 0;
 285     for (i = 0; i < h; i++) {
 286         s += sq[pix1[0] - pix2[0]];
 287         s += sq[pix1[1] - pix2[1]];
 288         s += sq[pix1[2] - pix2[2]];
 289         s += sq[pix1[3] - pix2[3]];
 290         s += sq[pix1[4] - pix2[4]];
 291         s += sq[pix1[5] - pix2[5]];
 292         s += sq[pix1[6] - pix2[6]];
 293         s += sq[pix1[7] - pix2[7]];
 294         pix1 += line_size;
 295         pix2 += line_size;
 296     }
 297     return s;
 298 }
 299
 300 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 301 {
 302     int s, i;
 303     uint32_t *sq = ff_squareTbl + 256;
 304
 305     s = 0;
 306     for (i = 0; i < h; i++) {
 307         s += sq[pix1[ 0] - pix2[ 0]];
 308         s += sq[pix1[ 1] - pix2[ 1]];
 309         s += sq[pix1[ 2] - pix2[ 2]];
 310         s += sq[pix1[ 3] - pix2[ 3]];
 311         s += sq[pix1[ 4] - pix2[ 4]];
 312         s += sq[pix1[ 5] - pix2[ 5]];
 313         s += sq[pix1[ 6] - pix2[ 6]];
 314         s += sq[pix1[ 7] - pix2[ 7]];
 315         s += sq[pix1[ 8] - pix2[ 8]];
 316         s += sq[pix1[ 9] - pix2[ 9]];
 317         s += sq[pix1[10] - pix2[10]];
 318         s += sq[pix1[11] - pix2[11]];
 319         s += sq[pix1[12] - pix2[12]];
 320         s += sq[pix1[13] - pix2[13]];
 321         s += sq[pix1[14] - pix2[14]];
 322         s += sq[pix1[15] - pix2[15]];
 323
 324         pix1 += line_size;
 325         pix2 += line_size;
 326     }
 327     return s;
 328 }
 329
 330 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 331                           const uint8_t *s2, int stride){
 332     int i;
 333
 334     /* read the pixels */
 335     for(i=0;i<8;i++) {
 336         block[0] = s1[0] - s2[0];
 337         block[1] = s1[1] - s2[1];
 338         block[2] = s1[2] - s2[2];
 339         block[3] = s1[3] - s2[3];
 340         block[4] = s1[4] - s2[4];
 341         block[5] = s1[5] - s2[5];
 342         block[6] = s1[6] - s2[6];
 343         block[7] = s1[7] - s2[7];
 344         s1 += stride;
 345         s2 += stride;
 346         block += 8;
 347     }
 348 }
 349
 350
 351 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 352                                  int line_size)
 353 {
 354     int i;
 355
 356     /* read the pixels */
 357     for(i=0;i<8;i++) {
 358         pixels[0] = av_clip_uint8(block[0]);
 359         pixels[1] = av_clip_uint8(block[1]);
 360         pixels[2] = av_clip_uint8(block[2]);
 361         pixels[3] = av_clip_uint8(block[3]);
 362         pixels[4] = av_clip_uint8(block[4]);
 363         pixels[5] = av_clip_uint8(block[5]);
 364         pixels[6] = av_clip_uint8(block[6]);
 365         pixels[7] = av_clip_uint8(block[7]);
 366
 367         pixels += line_size;
 368         block += 8;
 369     }
 370 }
 371
 372 static void put_signed_pixels_clamped_c(const int16_t *block,
 373                                         uint8_t *restrict pixels,
 374                                         int line_size)
 375 {
 376     int i, j;
 377
 378     for (i = 0; i < 8; i++) {
 379         for (j = 0; j < 8; j++) {
 380             if (*block < -128)
 381                 *pixels = 0;
 382             else if (*block > 127)
 383                 *pixels = 255;
 384             else
 385                 *pixels = (uint8_t)(*block + 128);
 386             block++;
 387             pixels++;
 388         }
 389         pixels += (line_size - 8);
 390     }
 391 }
 392
 393 static void add_pixels8_c(uint8_t *restrict pixels,
 394                           int16_t *block,
 395                           int line_size)
 396 {
 397     int i;
 398
 399     for(i=0;i<8;i++) {
 400         pixels[0] += block[0];
 401         pixels[1] += block[1];
 402         pixels[2] += block[2];
 403         pixels[3] += block[3];
 404         pixels[4] += block[4];
 405         pixels[5] += block[5];
 406         pixels[6] += block[6];
 407         pixels[7] += block[7];
 408         pixels += line_size;
 409         block += 8;
 410     }
 411 }
 412
 413 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 414                                  int line_size)
 415 {
 416     int i;
 417
 418     /* read the pixels */
 419     for(i=0;i<8;i++) {
 420         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 421         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 422         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 423         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 424         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 425         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 426         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 427         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 428         pixels += line_size;
 429         block += 8;
 430     }
 431 }
 432
 433 static int sum_abs_dctelem_c(int16_t *block)
 434 {
 435     int sum=0, i;
 436     for(i=0; i<64; i++)
 437         sum+= FFABS(block[i]);
 438     return sum;
 439 }
 440
 441 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 442 {
 443     int i;
 444
 445     for (i = 0; i < h; i++) {
 446         memset(block, value, 16);
 447         block += line_size;
 448     }
 449 }
 450
 451 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 452 {
 453     int i;
 454
 455     for (i = 0; i < h; i++) {
 456         memset(block, value, 8);
 457         block += line_size;
 458     }
 459 }
 460
 461 #define avg2(a,b) ((a+b+1)>>1)
 462 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 463
 464 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 465 {
 466     const int A=(16-x16)*(16-y16);
 467     const int B=(   x16)*(16-y16);
 468     const int C=(16-x16)*(   y16);
 469     const int D=(   x16)*(   y16);
 470     int i;
 471
 472     for(i=0; i<h; i++)
 473     {
 474         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 475         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 476         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 477         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 478         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 479         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 480         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 481         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 482         dst+= stride;
 483         src+= stride;
 484     }
 485 }
 486
 487 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 488                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 489 {
 490     int y, vx, vy;
 491     const int s= 1<<shift;
 492
 493     width--;
 494     height--;
 495
 496     for(y=0; y<h; y++){
 497         int x;
 498
 499         vx= ox;
 500         vy= oy;
 501         for(x=0; x<8; x++){ //XXX FIXME optimize
 502             int src_x, src_y, frac_x, frac_y, index;
 503
 504             src_x= vx>>16;
 505             src_y= vy>>16;
 506             frac_x= src_x&(s-1);
 507             frac_y= src_y&(s-1);
 508             src_x>>=shift;
 509             src_y>>=shift;
 510
 511             if((unsigned)src_x < width){
 512                 if((unsigned)src_y < height){
 513                     index= src_x + src_y*stride;
 514                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 515                                            + src[index       +1]*   frac_x )*(s-frac_y)
 516                                         + (  src[index+stride  ]*(s-frac_x)
 517                                            + src[index+stride+1]*   frac_x )*   frac_y
 518                                         + r)>>(shift*2);
 519                 }else{
 520                     index= src_x + av_clip(src_y, 0, height)*stride;
 521                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 522                                           + src[index       +1]*   frac_x )*s
 523                                         + r)>>(shift*2);
 524                 }
 525             }else{
 526                 if((unsigned)src_y < height){
 527                     index= av_clip(src_x, 0, width) + src_y*stride;
 528                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 529                                            + src[index+stride  ]*   frac_y )*s
 530                                         + r)>>(shift*2);
 531                 }else{
 532                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 533                     dst[y*stride + x]=    src[index         ];
 534                 }
 535             }
 536
 537             vx+= dxx;
 538             vy+= dyx;
 539         }
 540         ox += dxy;
 541         oy += dyy;
 542     }
 543 }
 544
 545 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 546     switch(width){
 547     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 548     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 549     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 550     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 551     }
 552 }
 553
 554 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 555     int i,j;
 556     for (i=0; i < height; i++) {
 557       for (j=0; j < width; j++) {
 558         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 559       }
 560       src += stride;
 561       dst += stride;
 562     }
 563 }
 564
 565 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 566     int i,j;
 567     for (i=0; i < height; i++) {
 568       for (j=0; j < width; j++) {
 569         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 570       }
 571       src += stride;
 572       dst += stride;
 573     }
 574 }
 575
 576 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 577     int i,j;
 578     for (i=0; i < height; i++) {
 579       for (j=0; j < width; j++) {
 580         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 581       }
 582       src += stride;
 583       dst += stride;
 584     }
 585 }
 586
 587 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 588     int i,j;
 589     for (i=0; i < height; i++) {
 590       for (j=0; j < width; j++) {
 591         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 592       }
 593       src += stride;
 594       dst += stride;
 595     }
 596 }
 597
 598 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 599     int i,j;
 600     for (i=0; i < height; i++) {
 601       for (j=0; j < width; j++) {
 602         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 603       }
 604       src += stride;
 605       dst += stride;
 606     }
 607 }
 608
 609 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 610     int i,j;
 611     for (i=0; i < height; i++) {
 612       for (j=0; j < width; j++) {
 613         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 614       }
 615       src += stride;
 616       dst += stride;
 617     }
 618 }
 619
 620 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 621     int i,j;
 622     for (i=0; i < height; i++) {
 623       for (j=0; j < width; j++) {
 624         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 625       }
 626       src += stride;
 627       dst += stride;
 628     }
 629 }
 630
 631 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 632     int i,j;
 633     for (i=0; i < height; i++) {
 634       for (j=0; j < width; j++) {
 635         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 636       }
 637       src += stride;
 638       dst += stride;
 639     }
 640 }
 641
 642 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 643     switch(width){
 644     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 645     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 646     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 647     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 648     }
 649 }
 650
 651 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 652     int i,j;
 653     for (i=0; i < height; i++) {
 654       for (j=0; j < width; j++) {
 655         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 656       }
 657       src += stride;
 658       dst += stride;
 659     }
 660 }
 661
 662 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 663     int i,j;
 664     for (i=0; i < height; i++) {
 665       for (j=0; j < width; j++) {
 666         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 667       }
 668       src += stride;
 669       dst += stride;
 670     }
 671 }
 672
 673 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 674     int i,j;
 675     for (i=0; i < height; i++) {
 676       for (j=0; j < width; j++) {
 677         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 678       }
 679       src += stride;
 680       dst += stride;
 681     }
 682 }
 683
 684 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 685     int i,j;
 686     for (i=0; i < height; i++) {
 687       for (j=0; j < width; j++) {
 688         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 689       }
 690       src += stride;
 691       dst += stride;
 692     }
 693 }
 694
 695 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 696     int i,j;
 697     for (i=0; i < height; i++) {
 698       for (j=0; j < width; j++) {
 699         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 700       }
 701       src += stride;
 702       dst += stride;
 703     }
 704 }
 705
 706 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 707     int i,j;
 708     for (i=0; i < height; i++) {
 709       for (j=0; j < width; j++) {
 710         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 711       }
 712       src += stride;
 713       dst += stride;
 714     }
 715 }
 716
 717 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 718     int i,j;
 719     for (i=0; i < height; i++) {
 720       for (j=0; j < width; j++) {
 721         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 722       }
 723       src += stride;
 724       dst += stride;
 725     }
 726 }
 727
 728 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 729     int i,j;
 730     for (i=0; i < height; i++) {
 731       for (j=0; j < width; j++) {
 732         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 733       }
 734       src += stride;
 735       dst += stride;
 736     }
 737 }
 738
 739 #define QPEL_MC(r, OPNAME, RND, OP) \
 740 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 741     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 742     int i;\
 743     for(i=0; i<h; i++)\
 744     {\
 745         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 746         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 747         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 748         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 749         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 750         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 751         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 752         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 753         dst+=dstStride;\
 754         src+=srcStride;\
 755     }\
 756 }\
 757 \
 758 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 759     const int w=8;\
 760     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 761     int i;\
 762     for(i=0; i<w; i++)\
 763     {\
 764         const int src0= src[0*srcStride];\
 765         const int src1= src[1*srcStride];\
 766         const int src2= src[2*srcStride];\
 767         const int src3= src[3*srcStride];\
 768         const int src4= src[4*srcStride];\
 769         const int src5= src[5*srcStride];\
 770         const int src6= src[6*srcStride];\
 771         const int src7= src[7*srcStride];\
 772         const int src8= src[8*srcStride];\
 773         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 774         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 775         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 776         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 777         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 778         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 779         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 780         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 781         dst++;\
 782         src++;\
 783     }\
 784 }\
 785 \
 786 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 787     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 788     int i;\
 789     \
 790     for(i=0; i<h; i++)\
 791     {\
 792         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 793         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 794         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 795         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 796         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 797         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 798         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 799         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 800         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 801         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 802         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 803         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 804         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 805         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 806         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 807         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 808         dst+=dstStride;\
 809         src+=srcStride;\
 810     }\
 811 }\
 812 \
 813 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 814     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 815     int i;\
 816     const int w=16;\
 817     for(i=0; i<w; i++)\
 818     {\
 819         const int src0= src[0*srcStride];\
 820         const int src1= src[1*srcStride];\
 821         const int src2= src[2*srcStride];\
 822         const int src3= src[3*srcStride];\
 823         const int src4= src[4*srcStride];\
 824         const int src5= src[5*srcStride];\
 825         const int src6= src[6*srcStride];\
 826         const int src7= src[7*srcStride];\
 827         const int src8= src[8*srcStride];\
 828         const int src9= src[9*srcStride];\
 829         const int src10= src[10*srcStride];\
 830         const int src11= src[11*srcStride];\
 831         const int src12= src[12*srcStride];\
 832         const int src13= src[13*srcStride];\
 833         const int src14= src[14*srcStride];\
 834         const int src15= src[15*srcStride];\
 835         const int src16= src[16*srcStride];\
 836         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 837         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 838         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 839         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 840         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 841         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 842         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 843         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 844         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 845         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 846         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 847         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 848         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 849         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 850         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 851         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 852         dst++;\
 853         src++;\
 854     }\
 855 }\
 856 \
 857 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 858 {\
 859     uint8_t half[64];\
 860     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 861     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 862 }\
 863 \
 864 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 865 {\
 866     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 867 }\
 868 \
 869 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 870 {\
 871     uint8_t half[64];\
 872     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 873     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 874 }\
 875 \
 876 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 877 {\
 878     uint8_t full[16*9];\
 879     uint8_t half[64];\
 880     copy_block9(full, src, 16, stride, 9);\
 881     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 882     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 883 }\
 884 \
 885 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 886 {\
 887     uint8_t full[16*9];\
 888     copy_block9(full, src, 16, stride, 9);\
 889     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 890 }\
 891 \
 892 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 893 {\
 894     uint8_t full[16*9];\
 895     uint8_t half[64];\
 896     copy_block9(full, src, 16, stride, 9);\
 897     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 898     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 899 }\
 900 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 901 {\
 902     uint8_t full[16*9];\
 903     uint8_t halfH[72];\
 904     uint8_t halfV[64];\
 905     uint8_t halfHV[64];\
 906     copy_block9(full, src, 16, stride, 9);\
 907     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 908     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 909     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 910     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 911 }\
 912 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 913 {\
 914     uint8_t full[16*9];\
 915     uint8_t halfH[72];\
 916     uint8_t halfHV[64];\
 917     copy_block9(full, src, 16, stride, 9);\
 918     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 919     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 920     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 921     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 922 }\
 923 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 924 {\
 925     uint8_t full[16*9];\
 926     uint8_t halfH[72];\
 927     uint8_t halfV[64];\
 928     uint8_t halfHV[64];\
 929     copy_block9(full, src, 16, stride, 9);\
 930     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 931     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 932     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 933     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 934 }\
 935 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 936 {\
 937     uint8_t full[16*9];\
 938     uint8_t halfH[72];\
 939     uint8_t halfHV[64];\
 940     copy_block9(full, src, 16, stride, 9);\
 941     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 942     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 943     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 944     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 945 }\
 946 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 947 {\
 948     uint8_t full[16*9];\
 949     uint8_t halfH[72];\
 950     uint8_t halfV[64];\
 951     uint8_t halfHV[64];\
 952     copy_block9(full, src, 16, stride, 9);\
 953     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 954     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 955     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 956     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 957 }\
 958 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 959 {\
 960     uint8_t full[16*9];\
 961     uint8_t halfH[72];\
 962     uint8_t halfHV[64];\
 963     copy_block9(full, src, 16, stride, 9);\
 964     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 965     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 966     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 967     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 968 }\
 969 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 970 {\
 971     uint8_t full[16*9];\
 972     uint8_t halfH[72];\
 973     uint8_t halfV[64];\
 974     uint8_t halfHV[64];\
 975     copy_block9(full, src, 16, stride, 9);\
 976     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 977     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 978     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 979     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 980 }\
 981 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 982 {\
 983     uint8_t full[16*9];\
 984     uint8_t halfH[72];\
 985     uint8_t halfHV[64];\
 986     copy_block9(full, src, 16, stride, 9);\
 987     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 988     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 989     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 990     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 991 }\
 992 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 993 {\
 994     uint8_t halfH[72];\
 995     uint8_t halfHV[64];\
 996     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 997     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 998     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 999 }\
1000 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1001 {\
1002     uint8_t halfH[72];\
1003     uint8_t halfHV[64];\
1004     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1005     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1007 }\
1008 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1009 {\
1010     uint8_t full[16*9];\
1011     uint8_t halfH[72];\
1012     uint8_t halfV[64];\
1013     uint8_t halfHV[64];\
1014     copy_block9(full, src, 16, stride, 9);\
1015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1019 }\
1020 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1021 {\
1022     uint8_t full[16*9];\
1023     uint8_t halfH[72];\
1024     copy_block9(full, src, 16, stride, 9);\
1025     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1027     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1028 }\
1029 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1030 {\
1031     uint8_t full[16*9];\
1032     uint8_t halfH[72];\
1033     uint8_t halfV[64];\
1034     uint8_t halfHV[64];\
1035     copy_block9(full, src, 16, stride, 9);\
1036     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1038     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1040 }\
1041 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1042 {\
1043     uint8_t full[16*9];\
1044     uint8_t halfH[72];\
1045     copy_block9(full, src, 16, stride, 9);\
1046     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1049 }\
1050 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1051 {\
1052     uint8_t halfH[72];\
1053     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1054     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1055 }\
1056 \
1057 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1058 {\
1059     uint8_t half[256];\
1060     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1061     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1062 }\
1063 \
1064 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1065 {\
1066     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1067 }\
1068 \
1069 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1070 {\
1071     uint8_t half[256];\
1072     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1073     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1074 }\
1075 \
1076 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1077 {\
1078     uint8_t full[24*17];\
1079     uint8_t half[256];\
1080     copy_block17(full, src, 24, stride, 17);\
1081     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1082     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1083 }\
1084 \
1085 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1086 {\
1087     uint8_t full[24*17];\
1088     copy_block17(full, src, 24, stride, 17);\
1089     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1090 }\
1091 \
1092 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1093 {\
1094     uint8_t full[24*17];\
1095     uint8_t half[256];\
1096     copy_block17(full, src, 24, stride, 17);\
1097     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1098     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1099 }\
1100 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1101 {\
1102     uint8_t full[24*17];\
1103     uint8_t halfH[272];\
1104     uint8_t halfV[256];\
1105     uint8_t halfHV[256];\
1106     copy_block17(full, src, 24, stride, 17);\
1107     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1111 }\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1113 {\
1114     uint8_t full[24*17];\
1115     uint8_t halfH[272];\
1116     uint8_t halfHV[256];\
1117     copy_block17(full, src, 24, stride, 17);\
1118     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1119     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1120     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1121     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1122 }\
1123 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1124 {\
1125     uint8_t full[24*17];\
1126     uint8_t halfH[272];\
1127     uint8_t halfV[256];\
1128     uint8_t halfHV[256];\
1129     copy_block17(full, src, 24, stride, 17);\
1130     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1131     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1132     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1133     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1134 }\
1135 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1136 {\
1137     uint8_t full[24*17];\
1138     uint8_t halfH[272];\
1139     uint8_t halfHV[256];\
1140     copy_block17(full, src, 24, stride, 17);\
1141     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1143     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1145 }\
1146 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1147 {\
1148     uint8_t full[24*17];\
1149     uint8_t halfH[272];\
1150     uint8_t halfV[256];\
1151     uint8_t halfHV[256];\
1152     copy_block17(full, src, 24, stride, 17);\
1153     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1154     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1155     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1156     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1157 }\
1158 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1159 {\
1160     uint8_t full[24*17];\
1161     uint8_t halfH[272];\
1162     uint8_t halfHV[256];\
1163     copy_block17(full, src, 24, stride, 17);\
1164     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1165     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1166     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1167     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1168 }\
1169 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1170 {\
1171     uint8_t full[24*17];\
1172     uint8_t halfH[272];\
1173     uint8_t halfV[256];\
1174     uint8_t halfHV[256];\
1175     copy_block17(full, src, 24, stride, 17);\
1176     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1177     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1178     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1179     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1180 }\
1181 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1182 {\
1183     uint8_t full[24*17];\
1184     uint8_t halfH[272];\
1185     uint8_t halfHV[256];\
1186     copy_block17(full, src, 24, stride, 17);\
1187     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1188     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1189     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1191 }\
1192 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1193 {\
1194     uint8_t halfH[272];\
1195     uint8_t halfHV[256];\
1196     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1197     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1198     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1199 }\
1200 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1201 {\
1202     uint8_t halfH[272];\
1203     uint8_t halfHV[256];\
1204     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1207 }\
1208 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1209 {\
1210     uint8_t full[24*17];\
1211     uint8_t halfH[272];\
1212     uint8_t halfV[256];\
1213     uint8_t halfHV[256];\
1214     copy_block17(full, src, 24, stride, 17);\
1215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1219 }\
1220 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1221 {\
1222     uint8_t full[24*17];\
1223     uint8_t halfH[272];\
1224     copy_block17(full, src, 24, stride, 17);\
1225     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1227     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1228 }\
1229 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1230 {\
1231     uint8_t full[24*17];\
1232     uint8_t halfH[272];\
1233     uint8_t halfV[256];\
1234     uint8_t halfHV[256];\
1235     copy_block17(full, src, 24, stride, 17);\
1236     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1238     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1240 }\
1241 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1242 {\
1243     uint8_t full[24*17];\
1244     uint8_t halfH[272];\
1245     copy_block17(full, src, 24, stride, 17);\
1246     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1249 }\
1250 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1251 {\
1252     uint8_t halfH[272];\
1253     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1254     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1255 }
1256
1257 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1258 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1259 #define op_put(a, b) a = cm[((b) + 16)>>5]
1260 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1261
1262 QPEL_MC(0, put_       , _       , op_put)
1263 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1264 QPEL_MC(0, avg_       , _       , op_avg)
1265 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1266 #undef op_avg
1267 #undef op_avg_no_rnd
1268 #undef op_put
1269 #undef op_put_no_rnd
1270
1271 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1272 {
1273     put_pixels8_8_c(dst, src, stride, 8);
1274 }
1275 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1276 {
1277     avg_pixels8_8_c(dst, src, stride, 8);
1278 }
1279 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1280 {
1281     put_pixels16_8_c(dst, src, stride, 16);
1282 }
1283 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1284 {
1285     avg_pixels16_8_c(dst, src, stride, 16);
1286 }
1287
1288 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
1289 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1290 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1291 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1292 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1293 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1294
1295 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1296     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1297     int i;
1298
1299     for(i=0; i<h; i++){
1300         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1301         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1302         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1303         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1304         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1305         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1306         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1307         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1308         dst+=dstStride;
1309         src+=srcStride;
1310     }
1311 }
1312
1313 #if CONFIG_RV40_DECODER
1314 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1315 {
1316     put_pixels16_xy2_8_c(dst, src, stride, 16);
1317 }
1318 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1319 {
1320     avg_pixels16_xy2_8_c(dst, src, stride, 16);
1321 }
1322 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1323 {
1324     put_pixels8_xy2_8_c(dst, src, stride, 8);
1325 }
1326 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1327 {
1328     avg_pixels8_xy2_8_c(dst, src, stride, 8);
1329 }
1330 #endif /* CONFIG_RV40_DECODER */
1331
1332 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1333     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1334     int i;
1335
1336     for(i=0; i<w; i++){
1337         const int src_1= src[ -srcStride];
1338         const int src0 = src[0          ];
1339         const int src1 = src[  srcStride];
1340         const int src2 = src[2*srcStride];
1341         const int src3 = src[3*srcStride];
1342         const int src4 = src[4*srcStride];
1343         const int src5 = src[5*srcStride];
1344         const int src6 = src[6*srcStride];
1345         const int src7 = src[7*srcStride];
1346         const int src8 = src[8*srcStride];
1347         const int src9 = src[9*srcStride];
1348         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1349         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1350         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1351         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1352         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1353         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1354         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1355         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1356         src++;
1357         dst++;
1358     }
1359 }
1360
1361 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1362 {
1363     uint8_t half[64];
1364     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1365     put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1366 }
1367
1368 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1369 {
1370     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1371 }
1372
1373 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1374 {
1375     uint8_t half[64];
1376     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1377     put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1378 }
1379
1380 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1381 {
1382     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1383 }
1384
1385 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1386 {
1387     uint8_t halfH[88];
1388     uint8_t halfV[64];
1389     uint8_t halfHV[64];
1390     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1391     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1392     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1393     put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1394 }
1395 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1396 {
1397     uint8_t halfH[88];
1398     uint8_t halfV[64];
1399     uint8_t halfHV[64];
1400     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1401     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1402     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1403     put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1404 }
1405 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1406 {
1407     uint8_t halfH[88];
1408     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1409     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1410 }
1411
1412 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1413 {
1414     int s, i;
1415
1416     s = 0;
1417     for(i=0;i<h;i++) {
1418         s += abs(pix1[0] - pix2[0]);
1419         s += abs(pix1[1] - pix2[1]);
1420         s += abs(pix1[2] - pix2[2]);
1421         s += abs(pix1[3] - pix2[3]);
1422         s += abs(pix1[4] - pix2[4]);
1423         s += abs(pix1[5] - pix2[5]);
1424         s += abs(pix1[6] - pix2[6]);
1425         s += abs(pix1[7] - pix2[7]);
1426         s += abs(pix1[8] - pix2[8]);
1427         s += abs(pix1[9] - pix2[9]);
1428         s += abs(pix1[10] - pix2[10]);
1429         s += abs(pix1[11] - pix2[11]);
1430         s += abs(pix1[12] - pix2[12]);
1431         s += abs(pix1[13] - pix2[13]);
1432         s += abs(pix1[14] - pix2[14]);
1433         s += abs(pix1[15] - pix2[15]);
1434         pix1 += line_size;
1435         pix2 += line_size;
1436     }
1437     return s;
1438 }
1439
1440 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1441 {
1442     int s, i;
1443
1444     s = 0;
1445     for(i=0;i<h;i++) {
1446         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1447         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1448         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1449         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1450         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1451         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1452         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1453         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1454         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1455         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1456         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1457         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1458         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1459         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1460         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1461         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1462         pix1 += line_size;
1463         pix2 += line_size;
1464     }
1465     return s;
1466 }
1467
1468 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1469 {
1470     int s, i;
1471     uint8_t *pix3 = pix2 + line_size;
1472
1473     s = 0;
1474     for(i=0;i<h;i++) {
1475         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1476         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1477         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1478         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1479         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1480         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1481         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1482         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1483         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1484         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1485         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1486         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1487         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1488         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1489         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1490         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1491         pix1 += line_size;
1492         pix2 += line_size;
1493         pix3 += line_size;
1494     }
1495     return s;
1496 }
1497
1498 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1499 {
1500     int s, i;
1501     uint8_t *pix3 = pix2 + line_size;
1502
1503     s = 0;
1504     for(i=0;i<h;i++) {
1505         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1506         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1507         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1508         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1509         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1510         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1511         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1512         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1513         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1514         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1515         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1516         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1517         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1518         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1519         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1520         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1521         pix1 += line_size;
1522         pix2 += line_size;
1523         pix3 += line_size;
1524     }
1525     return s;
1526 }
1527
1528 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1529 {
1530     int s, i;
1531
1532     s = 0;
1533     for(i=0;i<h;i++) {
1534         s += abs(pix1[0] - pix2[0]);
1535         s += abs(pix1[1] - pix2[1]);
1536         s += abs(pix1[2] - pix2[2]);
1537         s += abs(pix1[3] - pix2[3]);
1538         s += abs(pix1[4] - pix2[4]);
1539         s += abs(pix1[5] - pix2[5]);
1540         s += abs(pix1[6] - pix2[6]);
1541         s += abs(pix1[7] - pix2[7]);
1542         pix1 += line_size;
1543         pix2 += line_size;
1544     }
1545     return s;
1546 }
1547
/**
 * SAD of an 8-wide block against a reference in which every sample is
 * avg2() of the reference pixel and its right-hand neighbour.
 *
 * Reads pix2[0..8] on every row, i.e. one pixel more than the block width.
 *
 * @param v         context pointer, not used here
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1567
/**
 * SAD of an 8-wide block against a reference in which every sample is
 * avg2() of the reference pixel and the pixel directly below it.
 *
 * Reads h + 1 rows of pix2.
 *
 * @param v         context pointer, not used here
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1589
/**
 * SAD of an 8-wide block against a reference in which every sample is
 * avg4() of a 2x2 neighbourhood (pixel, right, below, below-right).
 *
 * Reads pix2[0..8] on h + 1 rows.
 *
 * @param v         context pointer, not used here
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1611
1612 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1613     MpegEncContext *c = v;
1614     int score1=0;
1615     int score2=0;
1616     int x,y;
1617
1618     for(y=0; y<h; y++){
1619         for(x=0; x<16; x++){
1620             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1621         }
1622         if(y+1<h){
1623             for(x=0; x<15; x++){
1624                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1625                              - s1[x+1] + s1[x+1+stride])
1626                         -FFABS(  s2[x  ] - s2[x  +stride]
1627                              - s2[x+1] + s2[x+1+stride]);
1628             }
1629         }
1630         s1+= stride;
1631         s2+= stride;
1632     }
1633
1634     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1635     else  return score1 + FFABS(score2)*8;
1636 }
1637
1638 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1639     MpegEncContext *c = v;
1640     int score1=0;
1641     int score2=0;
1642     int x,y;
1643
1644     for(y=0; y<h; y++){
1645         for(x=0; x<8; x++){
1646             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1647         }
1648         if(y+1<h){
1649             for(x=0; x<7; x++){
1650                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1651                              - s1[x+1] + s1[x+1+stride])
1652                         -FFABS(  s2[x  ] - s2[x  +stride]
1653                              - s2[x+1] + s2[x+1+stride]);
1654             }
1655         }
1656         s1+= stride;
1657         s2+= stride;
1658     }
1659
1660     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1661     else  return score1 + FFABS(score2)*8;
1662 }
1663
1664 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1665     int i;
1666     unsigned int sum=0;
1667
1668     for(i=0; i<8*8; i++){
1669         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1670         int w= weight[i];
1671         b>>= RECON_SHIFT;
1672         assert(-512<b && b<512);
1673
1674         sum += (w*b)*(w*b)>>4;
1675     }
1676     return sum>>2;
1677 }
1678
1679 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1680     int i;
1681
1682     for(i=0; i<8*8; i++){
1683         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1684     }
1685 }
1686
/**
 * Comparison function that ignores its arguments and always scores 0.
 * ff_set_cmp() installs it for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
1690
1691 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1692     int i;
1693
1694     memset(cmp, 0, sizeof(void*)*6);
1695
1696     for(i=0; i<6; i++){
1697         switch(type&0xFF){
1698         case FF_CMP_SAD:
1699             cmp[i]= c->sad[i];
1700             break;
1701         case FF_CMP_SATD:
1702             cmp[i]= c->hadamard8_diff[i];
1703             break;
1704         case FF_CMP_SSE:
1705             cmp[i]= c->sse[i];
1706             break;
1707         case FF_CMP_DCT:
1708             cmp[i]= c->dct_sad[i];
1709             break;
1710         case FF_CMP_DCT264:
1711             cmp[i]= c->dct264_sad[i];
1712             break;
1713         case FF_CMP_DCTMAX:
1714             cmp[i]= c->dct_max[i];
1715             break;
1716         case FF_CMP_PSNR:
1717             cmp[i]= c->quant_psnr[i];
1718             break;
1719         case FF_CMP_BIT:
1720             cmp[i]= c->bit[i];
1721             break;
1722         case FF_CMP_RD:
1723             cmp[i]= c->rd[i];
1724             break;
1725         case FF_CMP_VSAD:
1726             cmp[i]= c->vsad[i];
1727             break;
1728         case FF_CMP_VSSE:
1729             cmp[i]= c->vsse[i];
1730             break;
1731         case FF_CMP_ZERO:
1732             cmp[i]= zero_cmp;
1733             break;
1734         case FF_CMP_NSSE:
1735             cmp[i]= c->nsse[i];
1736             break;
1737         default:
1738             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1739         }
1740     }
1741 }
1742
1743 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1744     long i;
1745     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1746         long a = *(long*)(src+i);
1747         long b = *(long*)(dst+i);
1748         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1749     }
1750     for(; i<w; i++)
1751         dst[i+0] += src[i+0];
1752 }
1753
1754 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1755     long i;
1756 #if !HAVE_FAST_UNALIGNED
1757     if((long)src2 & (sizeof(long)-1)){
1758         for(i=0; i+7<w; i+=8){
1759             dst[i+0] = src1[i+0]-src2[i+0];
1760             dst[i+1] = src1[i+1]-src2[i+1];
1761             dst[i+2] = src1[i+2]-src2[i+2];
1762             dst[i+3] = src1[i+3]-src2[i+3];
1763             dst[i+4] = src1[i+4]-src2[i+4];
1764             dst[i+5] = src1[i+5]-src2[i+5];
1765             dst[i+6] = src1[i+6]-src2[i+6];
1766             dst[i+7] = src1[i+7]-src2[i+7];
1767         }
1768     }else
1769 #endif
1770     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1771         long a = *(long*)(src1+i);
1772         long b = *(long*)(src2+i);
1773         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1774     }
1775     for(; i<w; i++)
1776         dst[i+0] = src1[i+0]-src2[i+0];
1777 }
1778
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the prediction is mid_pred() of the previous output,
 * src1[i] and (previous + src1[i] - previous src1) masked to 8 bits; the
 * residual diff[i] is added and the result truncated to a byte.
 *
 * @param dst      reconstructed row
 * @param src1     reference row used by the predictor
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: initial previous output; out: last output
 * @param left_top in: initial previous src1 sample; out: last src1 sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int top  = src1[i];
        const int grad = (cur + top - diag) & 0xFF;

        cur    = mid_pred(cur, top, grad) + diff[i];
        diag   = top;
        dst[i] = cur;
    }

    *left     = cur;
    *left_top = diag;
}
1795
/**
 * Compute residuals of a row against a median predictor.
 *
 * For every position the prediction is mid_pred() of the previous src2
 * sample, src1[i] and (previous + src1[i] - previous src1) masked to
 * 8 bits; dst[i] receives src2[i] minus that prediction.
 *
 * @param dst      residual output
 * @param src1     reference row used by the predictor
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial previous sample; out: last src2 sample
 * @param left_top in: initial previous src1 sample; out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int top  = src1[i];
        const int pred = mid_pred(cur, top, (cur + top - diag) & 0xFF);

        diag   = top;
        cur    = src2[i];
        dst[i] = cur - pred;
    }

    *left     = cur;
    *left_top = diag;
}
1813
/**
 * Running-sum reconstruction: dst[i] = low byte of (acc + src[0] + ... + src[i]).
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of samples
 * @param acc starting accumulator value
 * @return the accumulator after the last sample (not truncated to a byte)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1832
/* Byte offsets of the four channels inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running-sum reconstruction of a row of 4-byte pixels.
 *
 * Each channel keeps its own accumulator; the low byte of the accumulator
 * is stored for every pixel.
 *
 * @param dst   output row, 4 bytes per pixel
 * @param src   residual row, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in/out accumulator for the R channel
 * @param green in/out accumulator for the G channel
 * @param blue  in/out accumulator for the B channel
 * @param alpha in/out accumulator for the A channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int r = *red;
    int g = *green;
    int b = *blue;
    int a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        /* read the whole pixel before writing it */
        b += in[B];
        g += in[G];
        r += in[R];
        a += in[A];

        out[B] = b;
        out[G] = g;
        out[R] = r;
        out[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
1873
/*
 * Butterfly out of place: o1 receives the sum and o2 the difference of
 * i1 and i2. Expands to two bare statements (not wrapped in a block), and
 * evaluates each input twice.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/*
 * Butterfly in place: x becomes x+y and y becomes x-y, using the values
 * both had on entry. x and y must be lvalues; the expansion declares
 * locals named a and b.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/*
 * Final butterfly stage folded into a magnitude: |x+y| + |x-y|, without
 * storing either result.
 */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1888
/**
 * Sum of absolute values of an 8x8 butterfly transform applied to the
 * pixel difference src - dst.
 *
 * Three butterfly stages run along every row, then two along every column;
 * the third column stage is folded into the absolute-value sum.
 *
 * @param s      context pointer, not used here
 * @param dst    first block
 * @param src    second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      must be 8
 * @return accumulated magnitude of the transformed difference
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int k;

    assert(h==8);

    /* horizontal pass over each of the eight rows */
    for (k = 0; k < 8; k++) {
        const uint8_t *ps = src + stride*k;
        const uint8_t *pd = dst + stride*k;
        int *row = coef + 8*k;

        BUTTERFLY2(row[0], row[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(row[2], row[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(row[4], row[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(row[6], row[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass over each of the eight columns */
    for (k = 0; k < 8; k++) {
        int *col = coef + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
1933
/**
 * Sum of absolute values of an 8x8 butterfly transform applied to the
 * pixels of src, with the |coef[0] + coef[32]| term removed at the end
 * (marked "-mean" in the original code).
 *
 * @param s      context pointer, not used here
 * @param src    block to transform
 * @param dummy  not used
 * @param stride offset between consecutive rows
 * @param h      must be 8
 * @return accumulated magnitude minus the removed term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int k;

    assert(h==8);

    /* horizontal pass over each of the eight rows */
    for (k = 0; k < 8; k++) {
        const uint8_t *ps = src + stride*k;
        int *row = coef + 8*k;

        BUTTERFLY2(row[0], row[1], ps[0], ps[1]);
        BUTTERFLY2(row[2], row[3], ps[2], ps[3]);
        BUTTERFLY2(row[4], row[5], ps[4], ps[5]);
        BUTTERFLY2(row[6], row[7], ps[6], ps[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass over each of the eight columns */
    for (k = 0; k < 8; k++) {
        int *col = coef + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
1981
1982 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1983     MpegEncContext * const s= (MpegEncContext *)c;
1984     LOCAL_ALIGNED_16(int16_t, temp, [64]);
1985
1986     assert(h==8);
1987
1988     s->dsp.diff_pixels(temp, src1, src2, stride);
1989     s->dsp.fdct(temp);
1990     return s->dsp.sum_abs_dctelem(temp);
1991 }
1992
#if CONFIG_GPL
/*
 * One 8-point 1-D transform pass built only from additions, subtractions
 * and right shifts. Inputs are read through SRC(0..7) and results are
 * delivered through DST(index, value); both accessor macros must be defined
 * by the user before expanding DCT8_1D. All eight inputs are read into
 * locals before the first DST(), so SRC and DST may refer to the same
 * storage.
 * NOTE(review): presumably the H.264-style 8x8 integer transform, going by
 * the "264" in the function name below - confirm.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block pair by applying DCT8_1D to the pixel difference,
 * first along each row (in place) and then along each column, and summing
 * the absolute values produced by the second pass.
 *
 * @param c      MpegEncContext pointer (only dsp.diff_pixels is used)
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      not read by this function (no assert, unlike its siblings)
 * @return sum of absolute transformed coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform row i in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform column i; outputs are not stored, only their
     * magnitudes are accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2045
2046 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2047     MpegEncContext * const s= (MpegEncContext *)c;
2048     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2049     int sum=0, i;
2050
2051     assert(h==8);
2052
2053     s->dsp.diff_pixels(temp, src1, src2, stride);
2054     s->dsp.fdct(temp);
2055
2056     for(i=0; i<64; i++)
2057         sum= FFMAX(sum, FFABS(temp[i]));
2058
2059     return sum;
2060 }
2061
2062 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2063     MpegEncContext * const s= (MpegEncContext *)c;
2064     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2065     int16_t * const bak = temp+64;
2066     int sum=0, i;
2067
2068     assert(h==8);
2069     s->mb_intra=0;
2070
2071     s->dsp.diff_pixels(temp, src1, src2, stride);
2072
2073     memcpy(bak, temp, 64*sizeof(int16_t));
2074
2075     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2076     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2077     ff_simple_idct_8(temp); //FIXME
2078
2079     for(i=0; i<64; i++)
2080         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2081
2082     return sum;
2083 }
2084
/**
 * Combined cost of coding the difference of an 8x8 block pair: a distortion
 * term plus a bit-count term scaled by qscale squared.
 *
 * Steps, in order: copy both blocks into local 8-byte-stride buffers, take
 * their difference, quantise it, count the bits of the quantised
 * coefficients from the context's length tables, dequantise, add the
 * inverse transform onto the local copy of src2 and measure the squared
 * error against the local copy of src1.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext pointer
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      must be 8
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on private copies so idct_add below does not touch the caller's data */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i only receives the quantiser's last output argument here; it is
     * reused as a loop index further down */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks price coefficient 0 separately and start the scan at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every non-zero coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* bias by 64 so levels in [-64, 63] index the table; anything
                 * outside that range is charged esc_length */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient is priced from the separate "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantise only when the quantiser reported at least one coefficient */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct onto the copy of src2, then compare with the copy of src1 */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2160
2161 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2162     MpegEncContext * const s= (MpegEncContext *)c;
2163     const uint8_t *scantable= s->intra_scantable.permutated;
2164     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2165     int i, last, run, bits, level, start_i;
2166     const int esc_length= s->ac_esc_length;
2167     uint8_t * length;
2168     uint8_t * last_length;
2169
2170     assert(h==8);
2171
2172     s->dsp.diff_pixels(temp, src1, src2, stride);
2173
2174     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2175
2176     bits=0;
2177
2178     if (s->mb_intra) {
2179         start_i = 1;
2180         length     = s->intra_ac_vlc_length;
2181         last_length= s->intra_ac_vlc_last_length;
2182         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2183     } else {
2184         start_i = 0;
2185         length     = s->inter_ac_vlc_length;
2186         last_length= s->inter_ac_vlc_last_length;
2187     }
2188
2189     if(last>=start_i){
2190         run=0;
2191         for(i=start_i; i<last; i++){
2192             int j= scantable[i];
2193             level= temp[j];
2194
2195             if(level){
2196                 level+=64;
2197                 if((level&(~127)) == 0){
2198                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2199                 }else
2200                     bits+= esc_length;
2201                 run=0;
2202             }else
2203                 run++;
2204         }
2205         i= scantable[last];
2206
2207         level= temp[i] + 64;
2208
2209         assert(level - 64);
2210
2211         if((level&(~127)) == 0){
2212             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2213         }else
2214             bits+= esc_length;
2215     }
2216
2217     return bits;
2218 }
2219
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel one row below it, over a block that is <size>
 * pixels wide and h rows tall (h - 1 row pairs). Columns are consumed four
 * at a time. The c and dummy arguments are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *below = s + stride;                                                                  \
                                                                                                            \
        for (col = 0; col < size; col += 4) {                                                               \
            total += FFABS(s[col  ] - below[col  ]) + FFABS(s[col+1] - below[col+1])                        \
                    +FFABS(s[col+2] - below[col+2]) + FFABS(s[col+3] - below[col+3]);                       \
        }                                                                                                   \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2237
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-wide block: for every pixel, |(s1 - s2) - (s1 - s2 one row below)|.
 *
 * @param c      context pointer, not used here
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      number of rows (h - 1 row pairs are evaluated)
 * @return accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *n1 = s1 + stride;
        const uint8_t *n2 = s2 + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - n1[col] + n2[col]);

        s1 += stride;
        s2 += stride;
    }

    return total;
}
2252
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel one row below it, over a block that is <size>
 * pixels wide and h rows tall (h - 1 row pairs). Columns are consumed four
 * at a time. The c and dummy arguments are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *below = s + stride;                                                                  \
                                                                                                            \
        for (col = 0; col < size; col += 4) {                                                               \
            total += SQ(s[col  ] - below[col  ]) + SQ(s[col+1] - below[col+1])                              \
                    +SQ(s[col+2] - below[col+2]) + SQ(s[col+3] - below[col+3]);                             \
        }                                                                                                   \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2271
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-wide block: for every pixel, ((s1 - s2) - (s1 - s2 one row below))^2.
 *
 * @param c      context pointer, not used here
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      number of rows (h - 1 row pairs are evaluated)
 * @return accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *n1 = s1 + stride;
        const uint8_t *n2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int grad = s1[col] - s2[col] - n1[col] + n2[col];
            total += grad * grad;
        }

        s1 += stride;
        s2 += stride;
    }

    return total;
}
2286
/**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first vector
 * @param pix2 second vector
 * @param size number of elements
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
2295
/*
 * Builds a 16-wide comparison function name16() out of an 8x8 one, name8().
 * The top two 8x8 quadrants are always scored (left, then right); the two
 * quadrants eight rows further down are added only when h == 16. The
 * quadrants are evaluated in that fixed order and their scores summed.
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int total = 0;\
    total += name8(s, dst    , src    , stride, 8);\
    total += name8(s, dst + 8, src + 8, stride, 8);\
    if (h == 16) {\
        uint8_t *dst_low = dst + 8*stride;\
        uint8_t *src_low = src + 8*stride;\
        total += name8(s, dst_low    , src_low    , stride, 8);\
        total += name8(s, dst_low + 8, src_low + 8, stride, 8);\
    }\
    return total;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2320
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     lower bound; returned when a > mini as unsigned integers
 * @param maxi     upper bound; returned when a with bit 31 flipped exceeds maxisign
 * @param maxisign maxi with bit 31 flipped, precomputed by the caller
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    return flipped > maxisign ? maxi : a;
}
2329
/**
 * Clip a float vector by reinterpreting the floats as 32-bit integers and
 * clipping each with clipf_c_one(). The only caller visible here,
 * vector_clipf_c(), uses this path when min < 0 and max > 0.
 *
 * Elements are processed in groups of eight; a group is always completed,
 * so len is effectively rounded up to a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits  = *(uint32_t*)min;
    const uint32_t hi_bits  = *(uint32_t*)max;
    const uint32_t hi_flip  = hi_bits ^ (1U<<31);
    uint32_t *out           = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds straddle zero the integer-compare variant is used;
 * otherwise each element goes through av_clipf(). Elements are processed
 * in groups of eight; a group is always completed, so len is effectively
 * rounded up to a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2365
/**
 * Dot product of two int16_t vectors.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[i] * v2[i], accumulated in an int
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        acc += v1[k] * v2[k];

    return acc;
}
2375
/**
 * Dot product of v1 and v2, combined with an in-place update
 * v1[i] += mul * v3[i]. Each product uses the value of v1[i] from before
 * its update.
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return sum of (old v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2385
/**
 * Apply a symmetric window to an int16_t signal.
 *
 * Only the first len/2 window coefficients are read; coefficient i scales
 * both sample i and its mirror sample len-1-i. Each product is rounded by
 * adding 1 << 14 and shifted right by 15. For odd len the middle sample is
 * not written.
 *
 * @param output destination samples
 * @param input  source samples
 * @param window window coefficients (first half)
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int front;

    for (front = 0; front < half; front++) {
        const unsigned int back = len-front-1;
        const int16_t coeff     = window[front];

        output[front] = (MUL16(input[front], coeff) + (1 << 14)) >> 15;
        output[back]  = (MUL16(input[back],  coeff) + (1 << 14)) >> 15;
    }
}
2398
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are handled in groups of eight and the first group is processed
 * before len is tested, so at least eight elements are always written. The
 * unsigned counter only reaches zero when len is a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2414
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() together with dest and line_size.
 * ff_dsputil_init() installs this as idct_put for FF_IDCT_INT.
 * NOTE(review): presumably this stores the clamped inverse-DCT output into
 * the picture, going by the helper's name - confirm against its definition.
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() together with dest and line_size.
 * ff_dsputil_init() installs this as idct_add for FF_IDCT_INT.
 * NOTE(review): presumably this adds the inverse-DCT output to the existing
 * picture data with clamping, going by the helper's name - confirm against
 * its definition.
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2425
2426 /* init static data */
2427 av_cold void ff_dsputil_static_init(void)
2428 {
2429     int i;
2430
2431     for(i=0;i<512;i++) {
2432         ff_squareTbl[i] = (i - 256) * (i - 256);
2433     }
2434
2435     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2436 }
2437
2438 int ff_check_alignment(void){
2439     static int did_fail=0;
2440     LOCAL_ALIGNED_16(int, aligned, [4]);
2441
2442     if((intptr_t)aligned & 15){
2443         if(!did_fail){
2444 #if HAVE_MMX || HAVE_ALTIVEC
2445             av_log(NULL, AV_LOG_ERROR,
2446                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2447                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2448                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2449                 "Do not report crashes to Libav developers.\n");
2450 #endif
2451             did_fail=1;
2452         }
2453         return -1;
2454     }
2455     return 0;
2456 }
2457
2458 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2459 {
2460     ff_check_alignment();
2461
2462 #if CONFIG_ENCODERS
2463     if (avctx->bits_per_raw_sample == 10) {
2464         c->fdct    = ff_jpeg_fdct_islow_10;
2465         c->fdct248 = ff_fdct248_islow_10;
2466     } else {
2467         if(avctx->dct_algo==FF_DCT_FASTINT) {
2468             c->fdct    = ff_fdct_ifast;
2469             c->fdct248 = ff_fdct_ifast248;
2470         }
2471         else if(avctx->dct_algo==FF_DCT_FAAN) {
2472             c->fdct    = ff_faandct;
2473             c->fdct248 = ff_faandct248;
2474         }
2475         else {
2476             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2477             c->fdct248 = ff_fdct248_islow_8;
2478         }
2479     }
2480 #endif //CONFIG_ENCODERS
2481
2482     if (avctx->bits_per_raw_sample == 10) {
2483         c->idct_put              = ff_simple_idct_put_10;
2484         c->idct_add              = ff_simple_idct_add_10;
2485         c->idct                  = ff_simple_idct_10;
2486         c->idct_permutation_type = FF_NO_IDCT_PERM;
2487     } else {
2488         if(avctx->idct_algo==FF_IDCT_INT){
2489             c->idct_put= jref_idct_put;
2490             c->idct_add= jref_idct_add;
2491             c->idct    = ff_j_rev_dct;
2492             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2493         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2494             c->idct_put= ff_faanidct_put;
2495             c->idct_add= ff_faanidct_add;
2496             c->idct    = ff_faanidct;
2497             c->idct_permutation_type= FF_NO_IDCT_PERM;
2498         }else{ //accurate/default
2499             c->idct_put = ff_simple_idct_put_8;
2500             c->idct_add = ff_simple_idct_add_8;
2501             c->idct     = ff_simple_idct_8;
2502             c->idct_permutation_type= FF_NO_IDCT_PERM;
2503         }
2504     }
2505
2506     c->diff_pixels = diff_pixels_c;
2507     c->put_pixels_clamped = put_pixels_clamped_c;
2508     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2509     c->add_pixels_clamped = add_pixels_clamped_c;
2510     c->sum_abs_dctelem = sum_abs_dctelem_c;
2511     c->gmc1 = gmc1_c;
2512     c->gmc = ff_gmc_c;
2513     c->pix_sum = pix_sum_c;
2514     c->pix_norm1 = pix_norm1_c;
2515
2516     c->fill_block_tab[0] = fill_block16_c;
2517     c->fill_block_tab[1] = fill_block8_c;
2518
2519     /* TODO [0] 16  [1] 8 */
2520     c->pix_abs[0][0] = pix_abs16_c;
2521     c->pix_abs[0][1] = pix_abs16_x2_c;
2522     c->pix_abs[0][2] = pix_abs16_y2_c;
2523     c->pix_abs[0][3] = pix_abs16_xy2_c;
2524     c->pix_abs[1][0] = pix_abs8_c;
2525     c->pix_abs[1][1] = pix_abs8_x2_c;
2526     c->pix_abs[1][2] = pix_abs8_y2_c;
2527     c->pix_abs[1][3] = pix_abs8_xy2_c;
2528
2529     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2530     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2531     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2532     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2533     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2534     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2535     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2536     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2537     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2538
2539     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2540     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2541     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2542     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2543     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2544     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2545     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2546     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2547     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2548
2549 #define dspfunc(PFX, IDX, NUM) \
2550     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2551     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2552     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2553     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2554     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2555     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2556     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2557     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2558     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2559     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2560     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2561     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2562     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2563     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2564     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2565     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2566
2567     dspfunc(put_qpel, 0, 16);
2568     dspfunc(put_no_rnd_qpel, 0, 16);
2569
2570     dspfunc(avg_qpel, 0, 16);
2571     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2572
2573     dspfunc(put_qpel, 1, 8);
2574     dspfunc(put_no_rnd_qpel, 1, 8);
2575
2576     dspfunc(avg_qpel, 1, 8);
2577     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2578
2579 #undef dspfunc
2580
2581     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2582     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2583     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2584     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2585     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2586     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2587     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2588     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2589
2590 #define SET_CMP_FUNC(name) \
2591     c->name[0]= name ## 16_c;\
2592     c->name[1]= name ## 8x8_c;
2593
2594     SET_CMP_FUNC(hadamard8_diff)
2595     c->hadamard8_diff[4]= hadamard8_intra16_c;
2596     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2597     SET_CMP_FUNC(dct_sad)
2598     SET_CMP_FUNC(dct_max)
2599 #if CONFIG_GPL
2600     SET_CMP_FUNC(dct264_sad)
2601 #endif
2602     c->sad[0]= pix_abs16_c;
2603     c->sad[1]= pix_abs8_c;
2604     c->sse[0]= sse16_c;
2605     c->sse[1]= sse8_c;
2606     c->sse[2]= sse4_c;
2607     SET_CMP_FUNC(quant_psnr)
2608     SET_CMP_FUNC(rd)
2609     SET_CMP_FUNC(bit)
2610     c->vsad[0]= vsad16_c;
2611     c->vsad[4]= vsad_intra16_c;
2612     c->vsad[5]= vsad_intra8_c;
2613     c->vsse[0]= vsse16_c;
2614     c->vsse[4]= vsse_intra16_c;
2615     c->vsse[5]= vsse_intra8_c;
2616     c->nsse[0]= nsse16_c;
2617     c->nsse[1]= nsse8_c;
2618
2619     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2620
2621     c->add_bytes= add_bytes_c;
2622     c->diff_bytes= diff_bytes_c;
2623     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2624     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2625     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2626     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2627     c->bswap_buf= bswap_buf;
2628     c->bswap16_buf = bswap16_buf;
2629
2630     c->try_8x8basis= try_8x8basis_c;
2631     c->add_8x8basis= add_8x8basis_c;
2632
2633     c->vector_clipf = vector_clipf_c;
2634     c->scalarproduct_int16 = scalarproduct_int16_c;
2635     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2636     c->apply_window_int16 = apply_window_int16_c;
2637     c->vector_clip_int32 = vector_clip_int32_c;
2638
2639     c->shrink[0]= av_image_copy_plane;
2640     c->shrink[1]= ff_shrink22;
2641     c->shrink[2]= ff_shrink44;
2642     c->shrink[3]= ff_shrink88;
2643
2644     c->add_pixels8 = add_pixels8_c;
2645
2646 #undef FUNC
2647 #undef FUNCC
2648 #define FUNC(f, depth) f ## _ ## depth
2649 #define FUNCC(f, depth) f ## _ ## depth ## _c
2650
2651     c->draw_edges                    = FUNCC(draw_edges, 8);
2652     c->clear_block                   = FUNCC(clear_block, 8);
2653     c->clear_blocks                  = FUNCC(clear_blocks, 8);
2654
2655 #define BIT_DEPTH_FUNCS(depth) \
2656     c->get_pixels                    = FUNCC(get_pixels,   depth);
2657
2658     switch (avctx->bits_per_raw_sample) {
2659     case 9:
2660     case 10:
2661         BIT_DEPTH_FUNCS(16);
2662         break;
2663     default:
2664         BIT_DEPTH_FUNCS(8);
2665         break;
2666     }
2667
2668
2669     if (ARCH_ARM)
2670         ff_dsputil_init_arm(c, avctx);
2671     if (ARCH_BFIN)
2672         ff_dsputil_init_bfin(c, avctx);
2673     if (ARCH_PPC)
2674         ff_dsputil_init_ppc(c, avctx);
2675     if (ARCH_SH4)
2676         ff_dsputil_init_sh4(c, avctx);
2677     if (HAVE_VIS)
2678         ff_dsputil_init_vis(c, avctx);
2679     if (ARCH_X86)
2680         ff_dsputil_init_x86(c, avctx);
2681
2682     ff_init_scantable_permutation(c->idct_permutation,
2683                                   c->idct_permutation_type);
2684 }