git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/attributes.h"
  31 #include "libavutil/imgutils.h"
  32 #include "libavutil/internal.h"
  33 #include "avcodec.h"
  34 #include "copy_block.h"
  35 #include "dct.h"
  36 #include "dsputil.h"
  37 #include "simple_idct.h"
  38 #include "faandct.h"
  39 #include "faanidct.h"
  40 #include "imgconvert.h"
  41 #include "mathops.h"
  42 #include "mpegvideo.h"
  43 #include "config.h"
  44
  45 uint32_t ff_squareTbl[512] = {0, };
  46
  47 #define BIT_DEPTH 16
  48 #include "dsputil_template.c"
  49 #undef BIT_DEPTH
  50
  51 #define BIT_DEPTH 8
  52 #include "dsputil_template.c"
  53
  54 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  55 #define pb_7f (~0UL/255 * 0x7f)
  56 #define pb_80 (~0UL/255 * 0x80)
  57
  58 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  59    specification, we interleave the fields */
  60 const uint8_t ff_zigzag248_direct[64] = {
  61      0,  8,  1,  9, 16, 24,  2, 10,
  62     17, 25, 32, 40, 48, 56, 33, 41,
  63     18, 26,  3, 11,  4, 12, 19, 27,
  64     34, 42, 49, 57, 50, 58, 35, 43,
  65     20, 28,  5, 13,  6, 14, 21, 29,
  66     36, 44, 51, 59, 52, 60, 37, 45,
  67     22, 30,  7, 15, 23, 31, 38, 46,
  68     53, 61, 54, 62, 39, 47, 55, 63,
  69 };
  70
/* Inverse of the zigzag_direct scan table, not permuted and with 1 added to
 * every entry, for the MMX quantizer. There is no initializer here;
 * presumably the table is filled in during initialization elsewhere
 * (not visible in this part of the file). */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  73
  74 const uint8_t ff_alternate_horizontal_scan[64] = {
  75     0,  1,   2,  3,  8,  9, 16, 17,
  76     10, 11,  4,  5,  6,  7, 15, 14,
  77     13, 12, 19, 18, 24, 25, 32, 33,
  78     26, 27, 20, 21, 22, 23, 28, 29,
  79     30, 31, 34, 35, 40, 41, 48, 49,
  80     42, 43, 36, 37, 38, 39, 44, 45,
  81     46, 47, 50, 51, 56, 57, 58, 59,
  82     52, 53, 54, 55, 60, 61, 62, 63,
  83 };
  84
  85 const uint8_t ff_alternate_vertical_scan[64] = {
  86     0,  8,  16, 24,  1,  9,  2, 10,
  87     17, 25, 32, 40, 48, 56, 57, 49,
  88     41, 33, 26, 18,  3, 11,  4, 12,
  89     19, 27, 34, 42, 50, 58, 35, 43,
  90     51, 59, 20, 28,  5, 13,  6, 14,
  91     21, 29, 36, 44, 52, 60, 37, 45,
  92     53, 61, 22, 30,  7, 15, 23, 31,
  93     38, 46, 54, 62, 39, 47, 55, 63,
  94 };
  95
  96 /* Input permutation for the simple_idct_mmx */
  97 static const uint8_t simple_mmx_permutation[64]={
  98         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  99         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 100         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 101         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 102         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 103         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 104         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 105         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 106 };
 107
 108 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 109
 110 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
 111                                const uint8_t *src_scantable)
 112 {
 113     int i;
 114     int end;
 115
 116     st->scantable= src_scantable;
 117
 118     for(i=0; i<64; i++){
 119         int j;
 120         j = src_scantable[i];
 121         st->permutated[i] = permutation[j];
 122     }
 123
 124     end=-1;
 125     for(i=0; i<64; i++){
 126         int j;
 127         j = st->permutated[i];
 128         if(j>end) end=j;
 129         st->raster_end[i]= end;
 130     }
 131 }
 132
 133 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
 134                                            int idct_permutation_type)
 135 {
 136     int i;
 137
 138     switch(idct_permutation_type){
 139     case FF_NO_IDCT_PERM:
 140         for(i=0; i<64; i++)
 141             idct_permutation[i]= i;
 142         break;
 143     case FF_LIBMPEG2_IDCT_PERM:
 144         for(i=0; i<64; i++)
 145             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 146         break;
 147     case FF_SIMPLE_IDCT_PERM:
 148         for(i=0; i<64; i++)
 149             idct_permutation[i]= simple_mmx_permutation[i];
 150         break;
 151     case FF_TRANSPOSE_IDCT_PERM:
 152         for(i=0; i<64; i++)
 153             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 154         break;
 155     case FF_PARTTRANS_IDCT_PERM:
 156         for(i=0; i<64; i++)
 157             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 158         break;
 159     case FF_SSE2_IDCT_PERM:
 160         for(i=0; i<64; i++)
 161             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 162         break;
 163     default:
 164         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 165     }
 166 }
 167
 168 static int pix_sum_c(uint8_t * pix, int line_size)
 169 {
 170     int s, i, j;
 171
 172     s = 0;
 173     for (i = 0; i < 16; i++) {
 174         for (j = 0; j < 16; j += 8) {
 175             s += pix[0];
 176             s += pix[1];
 177             s += pix[2];
 178             s += pix[3];
 179             s += pix[4];
 180             s += pix[5];
 181             s += pix[6];
 182             s += pix[7];
 183             pix += 8;
 184         }
 185         pix += line_size - 16;
 186     }
 187     return s;
 188 }
 189
 190 static int pix_norm1_c(uint8_t * pix, int line_size)
 191 {
 192     int s, i, j;
 193     uint32_t *sq = ff_squareTbl + 256;
 194
 195     s = 0;
 196     for (i = 0; i < 16; i++) {
 197         for (j = 0; j < 16; j += 8) {
 198 #if 0
 199             s += sq[pix[0]];
 200             s += sq[pix[1]];
 201             s += sq[pix[2]];
 202             s += sq[pix[3]];
 203             s += sq[pix[4]];
 204             s += sq[pix[5]];
 205             s += sq[pix[6]];
 206             s += sq[pix[7]];
 207 #else
 208 #if HAVE_FAST_64BIT
 209             register uint64_t x=*(uint64_t*)pix;
 210             s += sq[x&0xff];
 211             s += sq[(x>>8)&0xff];
 212             s += sq[(x>>16)&0xff];
 213             s += sq[(x>>24)&0xff];
 214             s += sq[(x>>32)&0xff];
 215             s += sq[(x>>40)&0xff];
 216             s += sq[(x>>48)&0xff];
 217             s += sq[(x>>56)&0xff];
 218 #else
 219             register uint32_t x=*(uint32_t*)pix;
 220             s += sq[x&0xff];
 221             s += sq[(x>>8)&0xff];
 222             s += sq[(x>>16)&0xff];
 223             s += sq[(x>>24)&0xff];
 224             x=*(uint32_t*)(pix+4);
 225             s += sq[x&0xff];
 226             s += sq[(x>>8)&0xff];
 227             s += sq[(x>>16)&0xff];
 228             s += sq[(x>>24)&0xff];
 229 #endif
 230 #endif
 231             pix += 8;
 232         }
 233         pix += line_size - 16;
 234     }
 235     return s;
 236 }
 237
 238 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 239     int i;
 240
 241     for(i=0; i+8<=w; i+=8){
 242         dst[i+0]= av_bswap32(src[i+0]);
 243         dst[i+1]= av_bswap32(src[i+1]);
 244         dst[i+2]= av_bswap32(src[i+2]);
 245         dst[i+3]= av_bswap32(src[i+3]);
 246         dst[i+4]= av_bswap32(src[i+4]);
 247         dst[i+5]= av_bswap32(src[i+5]);
 248         dst[i+6]= av_bswap32(src[i+6]);
 249         dst[i+7]= av_bswap32(src[i+7]);
 250     }
 251     for(;i<w; i++){
 252         dst[i+0]= av_bswap32(src[i+0]);
 253     }
 254 }
 255
 256 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 257 {
 258     while (len--)
 259         *dst++ = av_bswap16(*src++);
 260 }
 261
 262 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 263 {
 264     int s, i;
 265     uint32_t *sq = ff_squareTbl + 256;
 266
 267     s = 0;
 268     for (i = 0; i < h; i++) {
 269         s += sq[pix1[0] - pix2[0]];
 270         s += sq[pix1[1] - pix2[1]];
 271         s += sq[pix1[2] - pix2[2]];
 272         s += sq[pix1[3] - pix2[3]];
 273         pix1 += line_size;
 274         pix2 += line_size;
 275     }
 276     return s;
 277 }
 278
 279 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 280 {
 281     int s, i;
 282     uint32_t *sq = ff_squareTbl + 256;
 283
 284     s = 0;
 285     for (i = 0; i < h; i++) {
 286         s += sq[pix1[0] - pix2[0]];
 287         s += sq[pix1[1] - pix2[1]];
 288         s += sq[pix1[2] - pix2[2]];
 289         s += sq[pix1[3] - pix2[3]];
 290         s += sq[pix1[4] - pix2[4]];
 291         s += sq[pix1[5] - pix2[5]];
 292         s += sq[pix1[6] - pix2[6]];
 293         s += sq[pix1[7] - pix2[7]];
 294         pix1 += line_size;
 295         pix2 += line_size;
 296     }
 297     return s;
 298 }
 299
 300 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 301 {
 302     int s, i;
 303     uint32_t *sq = ff_squareTbl + 256;
 304
 305     s = 0;
 306     for (i = 0; i < h; i++) {
 307         s += sq[pix1[ 0] - pix2[ 0]];
 308         s += sq[pix1[ 1] - pix2[ 1]];
 309         s += sq[pix1[ 2] - pix2[ 2]];
 310         s += sq[pix1[ 3] - pix2[ 3]];
 311         s += sq[pix1[ 4] - pix2[ 4]];
 312         s += sq[pix1[ 5] - pix2[ 5]];
 313         s += sq[pix1[ 6] - pix2[ 6]];
 314         s += sq[pix1[ 7] - pix2[ 7]];
 315         s += sq[pix1[ 8] - pix2[ 8]];
 316         s += sq[pix1[ 9] - pix2[ 9]];
 317         s += sq[pix1[10] - pix2[10]];
 318         s += sq[pix1[11] - pix2[11]];
 319         s += sq[pix1[12] - pix2[12]];
 320         s += sq[pix1[13] - pix2[13]];
 321         s += sq[pix1[14] - pix2[14]];
 322         s += sq[pix1[15] - pix2[15]];
 323
 324         pix1 += line_size;
 325         pix2 += line_size;
 326     }
 327     return s;
 328 }
 329
 330 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 331                           const uint8_t *s2, int stride){
 332     int i;
 333
 334     /* read the pixels */
 335     for(i=0;i<8;i++) {
 336         block[0] = s1[0] - s2[0];
 337         block[1] = s1[1] - s2[1];
 338         block[2] = s1[2] - s2[2];
 339         block[3] = s1[3] - s2[3];
 340         block[4] = s1[4] - s2[4];
 341         block[5] = s1[5] - s2[5];
 342         block[6] = s1[6] - s2[6];
 343         block[7] = s1[7] - s2[7];
 344         s1 += stride;
 345         s2 += stride;
 346         block += 8;
 347     }
 348 }
 349
 350
 351 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 352                                  int line_size)
 353 {
 354     int i;
 355
 356     /* read the pixels */
 357     for(i=0;i<8;i++) {
 358         pixels[0] = av_clip_uint8(block[0]);
 359         pixels[1] = av_clip_uint8(block[1]);
 360         pixels[2] = av_clip_uint8(block[2]);
 361         pixels[3] = av_clip_uint8(block[3]);
 362         pixels[4] = av_clip_uint8(block[4]);
 363         pixels[5] = av_clip_uint8(block[5]);
 364         pixels[6] = av_clip_uint8(block[6]);
 365         pixels[7] = av_clip_uint8(block[7]);
 366
 367         pixels += line_size;
 368         block += 8;
 369     }
 370 }
 371
 372 static void put_signed_pixels_clamped_c(const int16_t *block,
 373                                         uint8_t *restrict pixels,
 374                                         int line_size)
 375 {
 376     int i, j;
 377
 378     for (i = 0; i < 8; i++) {
 379         for (j = 0; j < 8; j++) {
 380             if (*block < -128)
 381                 *pixels = 0;
 382             else if (*block > 127)
 383                 *pixels = 255;
 384             else
 385                 *pixels = (uint8_t)(*block + 128);
 386             block++;
 387             pixels++;
 388         }
 389         pixels += (line_size - 8);
 390     }
 391 }
 392
 393 static void add_pixels8_c(uint8_t *restrict pixels,
 394                           int16_t *block,
 395                           int line_size)
 396 {
 397     int i;
 398
 399     for(i=0;i<8;i++) {
 400         pixels[0] += block[0];
 401         pixels[1] += block[1];
 402         pixels[2] += block[2];
 403         pixels[3] += block[3];
 404         pixels[4] += block[4];
 405         pixels[5] += block[5];
 406         pixels[6] += block[6];
 407         pixels[7] += block[7];
 408         pixels += line_size;
 409         block += 8;
 410     }
 411 }
 412
 413 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 414                                  int line_size)
 415 {
 416     int i;
 417
 418     /* read the pixels */
 419     for(i=0;i<8;i++) {
 420         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 421         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 422         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 423         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 424         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 425         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 426         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 427         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 428         pixels += line_size;
 429         block += 8;
 430     }
 431 }
 432
 433 static int sum_abs_dctelem_c(int16_t *block)
 434 {
 435     int sum=0, i;
 436     for(i=0; i<64; i++)
 437         sum+= FFABS(block[i]);
 438     return sum;
 439 }
 440
 441 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 442 {
 443     int i;
 444
 445     for (i = 0; i < h; i++) {
 446         memset(block, value, 16);
 447         block += line_size;
 448     }
 449 }
 450
 451 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 452 {
 453     int i;
 454
 455     for (i = 0; i < h; i++) {
 456         memset(block, value, 8);
 457         block += line_size;
 458     }
 459 }
 460
 461 #define avg2(a,b) ((a+b+1)>>1)
 462 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 463
 464 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 465 {
 466     const int A=(16-x16)*(16-y16);
 467     const int B=(   x16)*(16-y16);
 468     const int C=(16-x16)*(   y16);
 469     const int D=(   x16)*(   y16);
 470     int i;
 471
 472     for(i=0; i<h; i++)
 473     {
 474         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 475         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 476         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 477         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 478         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 479         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 480         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 481         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 482         dst+= stride;
 483         src+= stride;
 484     }
 485 }
 486
 487 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 488                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 489 {
 490     int y, vx, vy;
 491     const int s= 1<<shift;
 492
 493     width--;
 494     height--;
 495
 496     for(y=0; y<h; y++){
 497         int x;
 498
 499         vx= ox;
 500         vy= oy;
 501         for(x=0; x<8; x++){ //XXX FIXME optimize
 502             int src_x, src_y, frac_x, frac_y, index;
 503
 504             src_x= vx>>16;
 505             src_y= vy>>16;
 506             frac_x= src_x&(s-1);
 507             frac_y= src_y&(s-1);
 508             src_x>>=shift;
 509             src_y>>=shift;
 510
 511             if((unsigned)src_x < width){
 512                 if((unsigned)src_y < height){
 513                     index= src_x + src_y*stride;
 514                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 515                                            + src[index       +1]*   frac_x )*(s-frac_y)
 516                                         + (  src[index+stride  ]*(s-frac_x)
 517                                            + src[index+stride+1]*   frac_x )*   frac_y
 518                                         + r)>>(shift*2);
 519                 }else{
 520                     index= src_x + av_clip(src_y, 0, height)*stride;
 521                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 522                                           + src[index       +1]*   frac_x )*s
 523                                         + r)>>(shift*2);
 524                 }
 525             }else{
 526                 if((unsigned)src_y < height){
 527                     index= av_clip(src_x, 0, width) + src_y*stride;
 528                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 529                                            + src[index+stride  ]*   frac_y )*s
 530                                         + r)>>(shift*2);
 531                 }else{
 532                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 533                     dst[y*stride + x]=    src[index         ];
 534                 }
 535             }
 536
 537             vx+= dxx;
 538             vy+= dyx;
 539         }
 540         ox += dxy;
 541         oy += dyy;
 542     }
 543 }
 544
 545 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 546     switch(width){
 547     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 548     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 549     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 550     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 551     }
 552 }
 553
 554 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 555     int i,j;
 556     for (i=0; i < height; i++) {
 557       for (j=0; j < width; j++) {
 558         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 559       }
 560       src += stride;
 561       dst += stride;
 562     }
 563 }
 564
 565 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 566     int i,j;
 567     for (i=0; i < height; i++) {
 568       for (j=0; j < width; j++) {
 569         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 570       }
 571       src += stride;
 572       dst += stride;
 573     }
 574 }
 575
 576 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 577     int i,j;
 578     for (i=0; i < height; i++) {
 579       for (j=0; j < width; j++) {
 580         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 581       }
 582       src += stride;
 583       dst += stride;
 584     }
 585 }
 586
 587 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 588     int i,j;
 589     for (i=0; i < height; i++) {
 590       for (j=0; j < width; j++) {
 591         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 592       }
 593       src += stride;
 594       dst += stride;
 595     }
 596 }
 597
 598 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 599     int i,j;
 600     for (i=0; i < height; i++) {
 601       for (j=0; j < width; j++) {
 602         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 603       }
 604       src += stride;
 605       dst += stride;
 606     }
 607 }
 608
 609 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 610     int i,j;
 611     for (i=0; i < height; i++) {
 612       for (j=0; j < width; j++) {
 613         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 614       }
 615       src += stride;
 616       dst += stride;
 617     }
 618 }
 619
 620 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 621     int i,j;
 622     for (i=0; i < height; i++) {
 623       for (j=0; j < width; j++) {
 624         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 625       }
 626       src += stride;
 627       dst += stride;
 628     }
 629 }
 630
 631 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 632     int i,j;
 633     for (i=0; i < height; i++) {
 634       for (j=0; j < width; j++) {
 635         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 636       }
 637       src += stride;
 638       dst += stride;
 639     }
 640 }
 641
 642 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 643     switch(width){
 644     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 645     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 646     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 647     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 648     }
 649 }
 650
 651 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 652     int i,j;
 653     for (i=0; i < height; i++) {
 654       for (j=0; j < width; j++) {
 655         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 656       }
 657       src += stride;
 658       dst += stride;
 659     }
 660 }
 661
 662 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 663     int i,j;
 664     for (i=0; i < height; i++) {
 665       for (j=0; j < width; j++) {
 666         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 667       }
 668       src += stride;
 669       dst += stride;
 670     }
 671 }
 672
 673 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 674     int i,j;
 675     for (i=0; i < height; i++) {
 676       for (j=0; j < width; j++) {
 677         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 678       }
 679       src += stride;
 680       dst += stride;
 681     }
 682 }
 683
 684 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 685     int i,j;
 686     for (i=0; i < height; i++) {
 687       for (j=0; j < width; j++) {
 688         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 689       }
 690       src += stride;
 691       dst += stride;
 692     }
 693 }
 694
 695 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 696     int i,j;
 697     for (i=0; i < height; i++) {
 698       for (j=0; j < width; j++) {
 699         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 700       }
 701       src += stride;
 702       dst += stride;
 703     }
 704 }
 705
 706 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 707     int i,j;
 708     for (i=0; i < height; i++) {
 709       for (j=0; j < width; j++) {
 710         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 711       }
 712       src += stride;
 713       dst += stride;
 714     }
 715 }
 716
 717 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 718     int i,j;
 719     for (i=0; i < height; i++) {
 720       for (j=0; j < width; j++) {
 721         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 722       }
 723       src += stride;
 724       dst += stride;
 725     }
 726 }
 727
 728 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 729     int i,j;
 730     for (i=0; i < height; i++) {
 731       for (j=0; j < width; j++) {
 732         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 733       }
 734       src += stride;
 735       dst += stride;
 736     }
 737 }
 738
 739 #define QPEL_MC(r, OPNAME, RND, OP) \
 740 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 741     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 742     int i;\
 743     for(i=0; i<h; i++)\
 744     {\
 745         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 746         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 747         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 748         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 749         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 750         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 751         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 752         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 753         dst+=dstStride;\
 754         src+=srcStride;\
 755     }\
 756 }\
 757 \
 758 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 759     const int w=8;\
 760     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 761     int i;\
 762     for(i=0; i<w; i++)\
 763     {\
 764         const int src0= src[0*srcStride];\
 765         const int src1= src[1*srcStride];\
 766         const int src2= src[2*srcStride];\
 767         const int src3= src[3*srcStride];\
 768         const int src4= src[4*srcStride];\
 769         const int src5= src[5*srcStride];\
 770         const int src6= src[6*srcStride];\
 771         const int src7= src[7*srcStride];\
 772         const int src8= src[8*srcStride];\
 773         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 774         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 775         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 776         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 777         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 778         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 779         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 780         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 781         dst++;\
 782         src++;\
 783     }\
 784 }\
 785 \
 786 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 787     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 788     int i;\
 789     \
 790     for(i=0; i<h; i++)\
 791     {\
 792         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 793         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 794         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 795         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 796         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 797         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 798         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 799         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 800         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 801         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 802         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 803         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 804         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 805         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 806         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 807         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 808         dst+=dstStride;\
 809         src+=srcStride;\
 810     }\
 811 }\
 812 \
 813 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 814     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 815     int i;\
 816     const int w=16;\
 817     for(i=0; i<w; i++)\
 818     {\
 819         const int src0= src[0*srcStride];\
 820         const int src1= src[1*srcStride];\
 821         const int src2= src[2*srcStride];\
 822         const int src3= src[3*srcStride];\
 823         const int src4= src[4*srcStride];\
 824         const int src5= src[5*srcStride];\
 825         const int src6= src[6*srcStride];\
 826         const int src7= src[7*srcStride];\
 827         const int src8= src[8*srcStride];\
 828         const int src9= src[9*srcStride];\
 829         const int src10= src[10*srcStride];\
 830         const int src11= src[11*srcStride];\
 831         const int src12= src[12*srcStride];\
 832         const int src13= src[13*srcStride];\
 833         const int src14= src[14*srcStride];\
 834         const int src15= src[15*srcStride];\
 835         const int src16= src[16*srcStride];\
 836         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 837         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 838         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 839         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 840         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 841         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 842         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 843         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 844         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 845         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 846         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 847         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 848         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 849         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 850         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 851         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 852         dst++;\
 853         src++;\
 854     }\
 855 }\
 856 \
 857 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 858 {\
 859     uint8_t half[64];\
 860     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 861     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 862 }\
 863 \
 864 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 865 {\
 866     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 867 }\
 868 \
 869 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 870 {\
 871     uint8_t half[64];\
 872     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 873     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 874 }\
 875 \
 876 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 877 {\
 878     uint8_t full[16*9];\
 879     uint8_t half[64];\
 880     copy_block9(full, src, 16, stride, 9);\
 881     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 882     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 883 }\
 884 \
 885 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 886 {\
 887     uint8_t full[16*9];\
 888     copy_block9(full, src, 16, stride, 9);\
 889     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 890 }\
 891 \
 892 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 893 {\
 894     uint8_t full[16*9];\
 895     uint8_t half[64];\
 896     copy_block9(full, src, 16, stride, 9);\
 897     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 898     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 899 }\
 900 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 901 {\
 902     uint8_t full[16*9];\
 903     uint8_t halfH[72];\
 904     uint8_t halfV[64];\
 905     uint8_t halfHV[64];\
 906     copy_block9(full, src, 16, stride, 9);\
 907     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 908     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 909     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 910     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 911 }\
 912 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 913 {\
 914     uint8_t full[16*9];\
 915     uint8_t halfH[72];\
 916     uint8_t halfHV[64];\
 917     copy_block9(full, src, 16, stride, 9);\
 918     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 919     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 920     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 921     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 922 }\
 923 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 924 {\
 925     uint8_t full[16*9];\
 926     uint8_t halfH[72];\
 927     uint8_t halfV[64];\
 928     uint8_t halfHV[64];\
 929     copy_block9(full, src, 16, stride, 9);\
 930     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 931     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 932     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 933     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 934 }\
 935 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 936 {\
 937     uint8_t full[16*9];\
 938     uint8_t halfH[72];\
 939     uint8_t halfHV[64];\
 940     copy_block9(full, src, 16, stride, 9);\
 941     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 942     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 943     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 944     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 945 }\
 946 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 947 {\
 948     uint8_t full[16*9];\
 949     uint8_t halfH[72];\
 950     uint8_t halfV[64];\
 951     uint8_t halfHV[64];\
 952     copy_block9(full, src, 16, stride, 9);\
 953     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 954     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 955     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 956     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 957 }\
 958 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 959 {\
 960     uint8_t full[16*9];\
 961     uint8_t halfH[72];\
 962     uint8_t halfHV[64];\
 963     copy_block9(full, src, 16, stride, 9);\
 964     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 965     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 966     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 967     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 968 }\
 969 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 970 {\
 971     uint8_t full[16*9];\
 972     uint8_t halfH[72];\
 973     uint8_t halfV[64];\
 974     uint8_t halfHV[64];\
 975     copy_block9(full, src, 16, stride, 9);\
 976     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 977     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 978     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 979     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 980 }\
 981 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 982 {\
 983     uint8_t full[16*9];\
 984     uint8_t halfH[72];\
 985     uint8_t halfHV[64];\
 986     copy_block9(full, src, 16, stride, 9);\
 987     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 988     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 989     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 990     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 991 }\
 992 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 993 {\
 994     uint8_t halfH[72];\
 995     uint8_t halfHV[64];\
 996     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 997     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 998     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 999 }\
1000 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1001 {\
1002     uint8_t halfH[72];\
1003     uint8_t halfHV[64];\
1004     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1005     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1007 }\
1008 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1009 {\
1010     uint8_t full[16*9];\
1011     uint8_t halfH[72];\
1012     uint8_t halfV[64];\
1013     uint8_t halfHV[64];\
1014     copy_block9(full, src, 16, stride, 9);\
1015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1019 }\
1020 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1021 {\
1022     uint8_t full[16*9];\
1023     uint8_t halfH[72];\
1024     copy_block9(full, src, 16, stride, 9);\
1025     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1027     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1028 }\
1029 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1030 {\
1031     uint8_t full[16*9];\
1032     uint8_t halfH[72];\
1033     uint8_t halfV[64];\
1034     uint8_t halfHV[64];\
1035     copy_block9(full, src, 16, stride, 9);\
1036     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1038     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1040 }\
1041 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1042 {\
1043     uint8_t full[16*9];\
1044     uint8_t halfH[72];\
1045     copy_block9(full, src, 16, stride, 9);\
1046     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1049 }\
1050 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1051 {\
1052     uint8_t halfH[72];\
1053     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1054     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1055 }\
1056 \
1057 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1058 {\
1059     uint8_t half[256];\
1060     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1061     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1062 }\
1063 \
1064 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1065 {\
1066     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1067 }\
1068 \
1069 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1070 {\
1071     uint8_t half[256];\
1072     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1073     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1074 }\
1075 \
1076 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1077 {\
1078     uint8_t full[24*17];\
1079     uint8_t half[256];\
1080     copy_block17(full, src, 24, stride, 17);\
1081     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1082     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1083 }\
1084 \
1085 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1086 {\
1087     uint8_t full[24*17];\
1088     copy_block17(full, src, 24, stride, 17);\
1089     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1090 }\
1091 \
1092 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1093 {\
1094     uint8_t full[24*17];\
1095     uint8_t half[256];\
1096     copy_block17(full, src, 24, stride, 17);\
1097     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1098     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1099 }\
1100 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1101 {\
1102     uint8_t full[24*17];\
1103     uint8_t halfH[272];\
1104     uint8_t halfV[256];\
1105     uint8_t halfHV[256];\
1106     copy_block17(full, src, 24, stride, 17);\
1107     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1111 }\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1113 {\
1114     uint8_t full[24*17];\
1115     uint8_t halfH[272];\
1116     uint8_t halfHV[256];\
1117     copy_block17(full, src, 24, stride, 17);\
1118     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1119     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1120     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1121     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1122 }\
1123 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1124 {\
1125     uint8_t full[24*17];\
1126     uint8_t halfH[272];\
1127     uint8_t halfV[256];\
1128     uint8_t halfHV[256];\
1129     copy_block17(full, src, 24, stride, 17);\
1130     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1131     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1132     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1133     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1134 }\
1135 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1136 {\
1137     uint8_t full[24*17];\
1138     uint8_t halfH[272];\
1139     uint8_t halfHV[256];\
1140     copy_block17(full, src, 24, stride, 17);\
1141     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1143     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1145 }\
1146 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1147 {\
1148     uint8_t full[24*17];\
1149     uint8_t halfH[272];\
1150     uint8_t halfV[256];\
1151     uint8_t halfHV[256];\
1152     copy_block17(full, src, 24, stride, 17);\
1153     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1154     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1155     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1156     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1157 }\
1158 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1159 {\
1160     uint8_t full[24*17];\
1161     uint8_t halfH[272];\
1162     uint8_t halfHV[256];\
1163     copy_block17(full, src, 24, stride, 17);\
1164     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1165     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1166     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1167     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1168 }\
1169 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1170 {\
1171     uint8_t full[24*17];\
1172     uint8_t halfH[272];\
1173     uint8_t halfV[256];\
1174     uint8_t halfHV[256];\
1175     copy_block17(full, src, 24, stride, 17);\
1176     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1177     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1178     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1179     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1180 }\
1181 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1182 {\
1183     uint8_t full[24*17];\
1184     uint8_t halfH[272];\
1185     uint8_t halfHV[256];\
1186     copy_block17(full, src, 24, stride, 17);\
1187     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1188     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1189     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1191 }\
1192 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1193 {\
1194     uint8_t halfH[272];\
1195     uint8_t halfHV[256];\
1196     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1197     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1198     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1199 }\
1200 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1201 {\
1202     uint8_t halfH[272];\
1203     uint8_t halfHV[256];\
1204     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1207 }\
1208 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1209 {\
1210     uint8_t full[24*17];\
1211     uint8_t halfH[272];\
1212     uint8_t halfV[256];\
1213     uint8_t halfHV[256];\
1214     copy_block17(full, src, 24, stride, 17);\
1215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1219 }\
1220 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1221 {\
1222     uint8_t full[24*17];\
1223     uint8_t halfH[272];\
1224     copy_block17(full, src, 24, stride, 17);\
1225     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1227     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1228 }\
1229 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1230 {\
1231     uint8_t full[24*17];\
1232     uint8_t halfH[272];\
1233     uint8_t halfV[256];\
1234     uint8_t halfHV[256];\
1235     copy_block17(full, src, 24, stride, 17);\
1236     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1238     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1240 }\
1241 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1242 {\
1243     uint8_t full[24*17];\
1244     uint8_t halfH[272];\
1245     copy_block17(full, src, 24, stride, 17);\
1246     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1249 }\
1250 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1251 {\
1252     uint8_t halfH[272];\
1253     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1254     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1255 }
1256
1257 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1258 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1259 #define op_put(a, b) a = cm[((b) + 16)>>5]
1260 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1261
1262 QPEL_MC(0, put_       , _       , op_put)
1263 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1264 QPEL_MC(0, avg_       , _       , op_avg)
1265 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1266 #undef op_avg
1267 #undef op_avg_no_rnd
1268 #undef op_put
1269 #undef op_put_no_rnd
1270
/**
 * Fixed-size entry point that forwards to put_pixels8_8_c().
 *
 * The final argument is always 8 (presumably the number of rows, giving
 * an 8x8 block as the function name suggests).
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_8_c(dst, src, stride, rows);
}
/**
 * Fixed-size entry point that forwards to avg_pixels8_8_c().
 *
 * The final argument is always 8 (presumably the number of rows, giving
 * an 8x8 block as the function name suggests).
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_8_c(dst, src, stride, rows);
}
/**
 * Fixed-size entry point that forwards to put_pixels16_8_c().
 *
 * The final argument is always 16 (presumably the number of rows, giving
 * a 16x16 block as the function name suggests).
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_8_c(dst, src, stride, rows);
}
/**
 * Fixed-size entry point that forwards to avg_pixels16_8_c().
 *
 * The final argument is always 16 (presumably the number of rows, giving
 * a 16x16 block as the function name suggests).
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_8_c(dst, src, stride, rows);
}
1287
/*
 * The mc00 entry of each qpel family is an alias for the matching plain
 * block wrapper. The no_rnd put variants map to the same functions as the
 * regular put variants.
 */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1294
1295 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1296     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1297     int i;
1298
1299     for(i=0; i<h; i++){
1300         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1301         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1302         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1303         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1304         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1305         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1306         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1307         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1308         dst+=dstStride;
1309         src+=srcStride;
1310     }
1311 }
1312
1313 #if CONFIG_RV40_DECODER
/**
 * RV40 mc33 entry for the 16-wide put family.
 *
 * Simply forwards to put_pixels16_xy2_8_c() with a fixed final argument
 * of 16 (presumably the block height).
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_xy2_8_c(dst, src, stride, rows);
}
/**
 * RV40 mc33 entry for the 16-wide avg family.
 *
 * Simply forwards to avg_pixels16_xy2_8_c() with a fixed final argument
 * of 16 (presumably the block height).
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}
/**
 * RV40 mc33 entry for the 8-wide put family.
 *
 * Simply forwards to put_pixels8_xy2_8_c() with a fixed final argument
 * of 8 (presumably the block height).
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_xy2_8_c(dst, src, stride, rows);
}
/**
 * RV40 mc33 entry for the 8-wide avg family.
 *
 * Simply forwards to avg_pixels8_xy2_8_c() with a fixed final argument
 * of 8 (presumably the block height).
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
1330 #endif /* CONFIG_RV40_DECODER */
1331
1332 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1333     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1334     int i;
1335
1336     for(i=0; i<w; i++){
1337         const int src_1= src[ -srcStride];
1338         const int src0 = src[0          ];
1339         const int src1 = src[  srcStride];
1340         const int src2 = src[2*srcStride];
1341         const int src3 = src[3*srcStride];
1342         const int src4 = src[4*srcStride];
1343         const int src5 = src[5*srcStride];
1344         const int src6 = src[6*srcStride];
1345         const int src7 = src[7*srcStride];
1346         const int src8 = src[8*srcStride];
1347         const int src9 = src[9*srcStride];
1348         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1349         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1350         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1351         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1352         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1353         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1354         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1355         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1356         src++;
1357         dst++;
1358     }
1359 }
1360
/**
 * mspel mc10: horizontally filter an 8x8 block into a temporary buffer,
 * then hand src and the filtered block to put_pixels8_l2_8(), which
 * writes dst (presumably the average of its two inputs).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1367
/**
 * mspel mc20: write the horizontally filtered 8x8 block straight to dst.
 * Source and destination use the same stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1372
/**
 * mspel mc30: like mc10, but the unfiltered input passed to
 * put_pixels8_l2_8() is taken one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1379
/**
 * mspel mc02: write the vertically filtered 8-column block straight to dst.
 * Source and destination use the same stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
1384
/**
 * mspel mc12: combine a vertically filtered block with a block that was
 * filtered horizontally and then vertically.
 *
 * The horizontal pass starts one row above src and covers 11 rows, because
 * the vertical filter reads rows -1 through 9 relative to its input pointer.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    /* hfilt + 8 skips the extra top row so it becomes the filter's row -1. */
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * mspel mc32: same as mc12 except that the purely vertical pass reads from
 * one pixel to the right (src + 1).
 *
 * The horizontal pass starts one row above src and covers 11 rows, because
 * the vertical filter reads rows -1 through 9 relative to its input pointer.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src + 1, 8, stride, 8);
    /* hfilt + 8 skips the extra top row so it becomes the filter's row -1. */
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * mspel mc22: horizontal filter followed by vertical filter, written
 * directly to dst.
 *
 * The horizontal pass starts one row above src and covers 11 rows, which is
 * exactly what the vertical filter reads (rows -1 through 9).
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 11];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    /* hfilt + 8 skips the extra top row so it becomes the filter's row -1. */
    wmv2_mspel8_v_lowpass(dst, hfilt + 8, stride, 8, 8);
}
1411
1412 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1413     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1414     int x;
1415     const int strength= ff_h263_loop_filter_strength[qscale];
1416
1417     for(x=0; x<8; x++){
1418         int d1, d2, ad1;
1419         int p0= src[x-2*stride];
1420         int p1= src[x-1*stride];
1421         int p2= src[x+0*stride];
1422         int p3= src[x+1*stride];
1423         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1424
1425         if     (d<-2*strength) d1= 0;
1426         else if(d<-  strength) d1=-2*strength - d;
1427         else if(d<   strength) d1= d;
1428         else if(d< 2*strength) d1= 2*strength - d;
1429         else                   d1= 0;
1430
1431         p1 += d1;
1432         p2 -= d1;
1433         if(p1&256) p1= ~(p1>>31);
1434         if(p2&256) p2= ~(p2>>31);
1435
1436         src[x-1*stride] = p1;
1437         src[x+0*stride] = p2;
1438
1439         ad1= FFABS(d1)>>1;
1440
1441         d2= av_clip((p0-p3)/4, -ad1, ad1);
1442
1443         src[x-2*stride] = p0 - d2;
1444         src[x+  stride] = p3 + d2;
1445     }
1446     }
1447 }
1448
1449 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1450     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1451     int y;
1452     const int strength= ff_h263_loop_filter_strength[qscale];
1453
1454     for(y=0; y<8; y++){
1455         int d1, d2, ad1;
1456         int p0= src[y*stride-2];
1457         int p1= src[y*stride-1];
1458         int p2= src[y*stride+0];
1459         int p3= src[y*stride+1];
1460         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1461
1462         if     (d<-2*strength) d1= 0;
1463         else if(d<-  strength) d1=-2*strength - d;
1464         else if(d<   strength) d1= d;
1465         else if(d< 2*strength) d1= 2*strength - d;
1466         else                   d1= 0;
1467
1468         p1 += d1;
1469         p2 -= d1;
1470         if(p1&256) p1= ~(p1>>31);
1471         if(p2&256) p2= ~(p2>>31);
1472
1473         src[y*stride-1] = p1;
1474         src[y*stride+0] = p2;
1475
1476         ad1= FFABS(d1)>>1;
1477
1478         d2= av_clip((p0-p3)/4, -ad1, ad1);
1479
1480         src[y*stride-2] = p0 - d2;
1481         src[y*stride+1] = p3 + d2;
1482     }
1483     }
1484 }
1485
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 * h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1513
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference whose pixels are formed by avg2() of each reference pixel and
 * its right-hand neighbour. Each reference row therefore reads 17 pixels.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 * h pixels
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1541
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference whose pixels are formed by avg2() of each reference pixel and
 * the pixel one row below it. The reference is therefore read over h + 1
 * rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 * h pixels
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size; /* reference row under the current one */
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1571
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference whose pixels are formed by avg4() of a 2x2 neighbourhood:
 * the reference pixel, its right-hand neighbour, and the two pixels below
 * those. The reference is therefore read over 17 columns and h + 1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 * h pixels
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size; /* reference row under the current one */
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1601
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);

        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1621
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * horizontally interpolated reference: every reference sample is avg2() of
 * a pixel and its right-hand neighbour.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; 9 columns per row are read
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1641
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * vertically interpolated reference: every reference sample is avg2() of
 * the pixel in the current row and the pixel one row (line_size bytes) below.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; rows 0..h are read
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));

        pix1 += line_size;
        pix2  = below;
    }
    return sum;
}
1663
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * interpolated in both directions: every reference sample is avg4() of the
 * 2x2 neighbourhood starting at the same position.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; 9 columns of rows 0..h are read
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));

        pix1 += line_size;
        pix2  = below;
    }
    return sum;
}
1685
1686 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1687     MpegEncContext *c = v;
1688     int score1=0;
1689     int score2=0;
1690     int x,y;
1691
1692     for(y=0; y<h; y++){
1693         for(x=0; x<16; x++){
1694             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1695         }
1696         if(y+1<h){
1697             for(x=0; x<15; x++){
1698                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1699                              - s1[x+1] + s1[x+1+stride])
1700                         -FFABS(  s2[x  ] - s2[x  +stride]
1701                              - s2[x+1] + s2[x+1+stride]);
1702             }
1703         }
1704         s1+= stride;
1705         s2+= stride;
1706     }
1707
1708     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1709     else  return score1 + FFABS(score2)*8;
1710 }
1711
1712 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1713     MpegEncContext *c = v;
1714     int score1=0;
1715     int score2=0;
1716     int x,y;
1717
1718     for(y=0; y<h; y++){
1719         for(x=0; x<8; x++){
1720             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1721         }
1722         if(y+1<h){
1723             for(x=0; x<7; x++){
1724                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1725                              - s1[x+1] + s1[x+1+stride])
1726                         -FFABS(  s2[x  ] - s2[x  +stride]
1727                              - s2[x+1] + s2[x+1+stride]);
1728             }
1729         }
1730         s1+= stride;
1731         s2+= stride;
1732     }
1733
1734     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1735     else  return score1 + FFABS(score2)*8;
1736 }
1737
1738 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1739     int i;
1740     unsigned int sum=0;
1741
1742     for(i=0; i<8*8; i++){
1743         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1744         int w= weight[i];
1745         b>>= RECON_SHIFT;
1746         assert(-512<b && b<512);
1747
1748         sum += (w*b)*(w*b)>>4;
1749     }
1750     return sum>>2;
1751 }
1752
1753 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1754     int i;
1755
1756     for(i=0; i<8*8; i++){
1757         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1758     }
1759 }
1760
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of zero.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
1764
1765 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1766     int i;
1767
1768     memset(cmp, 0, sizeof(void*)*6);
1769
1770     for(i=0; i<6; i++){
1771         switch(type&0xFF){
1772         case FF_CMP_SAD:
1773             cmp[i]= c->sad[i];
1774             break;
1775         case FF_CMP_SATD:
1776             cmp[i]= c->hadamard8_diff[i];
1777             break;
1778         case FF_CMP_SSE:
1779             cmp[i]= c->sse[i];
1780             break;
1781         case FF_CMP_DCT:
1782             cmp[i]= c->dct_sad[i];
1783             break;
1784         case FF_CMP_DCT264:
1785             cmp[i]= c->dct264_sad[i];
1786             break;
1787         case FF_CMP_DCTMAX:
1788             cmp[i]= c->dct_max[i];
1789             break;
1790         case FF_CMP_PSNR:
1791             cmp[i]= c->quant_psnr[i];
1792             break;
1793         case FF_CMP_BIT:
1794             cmp[i]= c->bit[i];
1795             break;
1796         case FF_CMP_RD:
1797             cmp[i]= c->rd[i];
1798             break;
1799         case FF_CMP_VSAD:
1800             cmp[i]= c->vsad[i];
1801             break;
1802         case FF_CMP_VSSE:
1803             cmp[i]= c->vsse[i];
1804             break;
1805         case FF_CMP_ZERO:
1806             cmp[i]= zero_cmp;
1807             break;
1808         case FF_CMP_NSSE:
1809             cmp[i]= c->nsse[i];
1810             break;
1811         default:
1812             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1813         }
1814     }
1815 }
1816
1817 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1818     long i;
1819     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1820         long a = *(long*)(src+i);
1821         long b = *(long*)(dst+i);
1822         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1823     }
1824     for(; i<w; i++)
1825         dst[i+0] += src[i+0];
1826 }
1827
1828 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1829     long i;
1830 #if !HAVE_FAST_UNALIGNED
1831     if((long)src2 & (sizeof(long)-1)){
1832         for(i=0; i+7<w; i+=8){
1833             dst[i+0] = src1[i+0]-src2[i+0];
1834             dst[i+1] = src1[i+1]-src2[i+1];
1835             dst[i+2] = src1[i+2]-src2[i+2];
1836             dst[i+3] = src1[i+3]-src2[i+3];
1837             dst[i+4] = src1[i+4]-src2[i+4];
1838             dst[i+5] = src1[i+5]-src2[i+5];
1839             dst[i+6] = src1[i+6]-src2[i+6];
1840             dst[i+7] = src1[i+7]-src2[i+7];
1841         }
1842     }else
1843 #endif
1844     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1845         long a = *(long*)(src1+i);
1846         long b = *(long*)(src2+i);
1847         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1848     }
1849     for(; i<w; i++)
1850         dst[i+0] = src1[i+0]-src2[i+0];
1851 }
1852
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous output
 * byte, src1[i], and (previous + src1[i] - previous src1 value) & 0xFF;
 * diff[i] is added to it and the low 8 bits are stored in dst[i].
 *
 * @param dst      reconstructed row
 * @param src1     reference row used by the predictor
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: initial previous-output value; out: last output byte
 * @param left_top in: initial previous-src1 value; out: last src1 byte used
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int grad = (cur + top - top_left) & 0xFF;

        cur      = mid_pred(cur, top, grad) + diff[x];
        top_left = top;
        dst[x]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
1869
/**
 * Compute residuals of a row against a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous src2
 * byte, src1[i], and (previous + src1[i] - previous src1 value) & 0xFF;
 * dst[i] receives src2[i] minus that predictor (low 8 bits).
 *
 * @param dst      residuals
 * @param src1     reference row used by the predictor
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: initial previous-src2 value; out: last src2 byte
 * @param left_top in: initial previous-src1 value; out: last src1 byte used
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int grad = (cur + top - top_left) & 0xFF;
        const int pred = mid_pred(cur, top, grad);

        top_left = top;
        cur      = src2[x];
        dst[x]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}
1887
/**
 * Running-sum reconstruction of a row: each output byte is the low 8 bits
 * of the accumulator after adding the corresponding source byte.
 *
 * @param dst destination row
 * @param src source row
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the accumulator after the last byte (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1906
/* Byte offsets of the four components inside a 4-byte pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum reconstruction of a row of 4-byte pixels, with an independent
 * accumulator per component. Each output byte is the low 8 bits of its
 * component's accumulator after adding the corresponding source byte.
 *
 * @param dst   destination row (4 bytes per pixel)
 * @param src   source row (4 bytes per pixel)
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: final accumulator
 * @param green in: initial accumulator; out: final accumulator
 * @param blue  in: initial accumulator; out: final accumulator
 * @param alpha in: initial accumulator; out: final accumulator
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1947
/**
 * Two-point butterfly with separate outputs: o1 receives the sum and o2 the
 * difference of the two inputs. The inputs are evaluated twice.
 * Wrapped in do/while(0) so that the macro behaves as a single statement,
 * e.g. as the body of an unbraced if.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/**
 * In-place two-point butterfly: x becomes x+y and y becomes x-y, both
 * computed from the original values.
 * Wrapped in do/while(0) so that "BUTTERFLY1(x,y);" is a single statement
 * that can be followed by else.
 */
#define BUTTERFLY1(x,y) \
do {\
    int butterfly_a_, butterfly_b_;\
    butterfly_a_= (x);\
    butterfly_b_= (y);\
    (x)= butterfly_a_+butterfly_b_;\
    (y)= butterfly_a_-butterfly_b_;\
} while (0)

/** Sum of the absolute values of both outputs of a two-point butterfly. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1962
/**
 * Transform the 8x8 difference src - dst with three butterfly stages per
 * row followed by three butterfly stages per column, and return the sum of
 * the absolute values of the resulting 64 coefficients.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block dst is subtracted from
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int total = 0;
    int row, col;

    assert(h==8);

    /* row pass */
    for (row = 0; row < 8; row++) {
        //FIXME try pointer walks
        int *t            = temp + 8 * row;
        const uint8_t *sp = src + stride * row;
        const uint8_t *dp = dst + stride * row;

        BUTTERFLY2(t[0], t[1], sp[0] - dp[0], sp[1] - dp[1]);
        BUTTERFLY2(t[2], t[3], sp[2] - dp[2], sp[3] - dp[3]);
        BUTTERFLY2(t[4], t[5], sp[4] - dp[4], sp[5] - dp[5]);
        BUTTERFLY2(t[6], t[7], sp[6] - dp[6], sp[7] - dp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* column pass; the last stage is folded into the absolute sum */
    for (col = 0; col < 8; col++) {
        int *t = temp + col;

        BUTTERFLY1(t[8 * 0], t[8 * 1]);
        BUTTERFLY1(t[8 * 2], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 5]);
        BUTTERFLY1(t[8 * 6], t[8 * 7]);

        BUTTERFLY1(t[8 * 0], t[8 * 2]);
        BUTTERFLY1(t[8 * 1], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 6]);
        BUTTERFLY1(t[8 * 5], t[8 * 7]);

        total += BUTTERFLYA(t[8 * 0], t[8 * 4])
               + BUTTERFLYA(t[8 * 1], t[8 * 5])
               + BUTTERFLYA(t[8 * 2], t[8 * 6])
               + BUTTERFLYA(t[8 * 3], t[8 * 7]);
    }
    return total;
}
2007
/**
 * Transform an 8x8 block of pixels with three butterfly stages per row
 * followed by three butterfly stages per column, and return the sum of the
 * absolute values of the resulting coefficients, excluding the term
 * |temp[0] + temp[32]| (marked "-mean" in the code).
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return sum of absolute transformed values without the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int row, col;

    assert(h==8);

    /* row pass */
    for (row = 0; row < 8; row++) {
        //FIXME try pointer walks
        int *t            = temp + 8 * row;
        const uint8_t *sp = src + stride * row;

        BUTTERFLY2(t[0], t[1], sp[0], sp[1]);
        BUTTERFLY2(t[2], t[3], sp[2], sp[3]);
        BUTTERFLY2(t[4], t[5], sp[4], sp[5]);
        BUTTERFLY2(t[6], t[7], sp[6], sp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* column pass; the last stage is folded into the absolute sum */
    for (col = 0; col < 8; col++) {
        int *t = temp + col;

        BUTTERFLY1(t[8 * 0], t[8 * 1]);
        BUTTERFLY1(t[8 * 2], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 5]);
        BUTTERFLY1(t[8 * 6], t[8 * 7]);

        BUTTERFLY1(t[8 * 0], t[8 * 2]);
        BUTTERFLY1(t[8 * 1], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 6]);
        BUTTERFLY1(t[8 * 5], t[8 * 7]);

        total += BUTTERFLYA(t[8 * 0], t[8 * 4])
               + BUTTERFLYA(t[8 * 1], t[8 * 5])
               + BUTTERFLYA(t[8 * 2], t[8 * 6])
               + BUTTERFLYA(t[8 * 3], t[8 * 7]);
    }

    total -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return total;
}
2055
2056 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2057     MpegEncContext * const s= (MpegEncContext *)c;
2058     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2059
2060     assert(h==8);
2061
2062     s->dsp.diff_pixels(temp, src1, src2, stride);
2063     s->dsp.fdct(temp);
2064     return s->dsp.sum_abs_dctelem(temp);
2065 }
2066
#if CONFIG_GPL
/*
 * One-dimensional 8-point integer transform built from adds and shifts.
 * The user must define SRC(x) to read input x and DST(x,v) to consume
 * output x with value v. All inputs are read before any output is produced.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Apply DCT8_1D to the rows and then to the columns of the 8x8 difference
 * produced by the context's diff_pixels(), and return the sum of the
 * absolute values of the final outputs.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = (MpegEncContext *)c;
    int16_t dct[8][8];
    int total = 0;
    int i;

    enc->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: results are written back in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: results are only accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) total += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    return total;
}
#endif
2119
2120 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2121     MpegEncContext * const s= (MpegEncContext *)c;
2122     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2123     int sum=0, i;
2124
2125     assert(h==8);
2126
2127     s->dsp.diff_pixels(temp, src1, src2, stride);
2128     s->dsp.fdct(temp);
2129
2130     for(i=0; i<64; i++)
2131         sum= FFMAX(sum, FFABS(temp[i]));
2132
2133     return sum;
2134 }
2135
2136 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2137     MpegEncContext * const s= (MpegEncContext *)c;
2138     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2139     int16_t * const bak = temp+64;
2140     int sum=0, i;
2141
2142     assert(h==8);
2143     s->mb_intra=0;
2144
2145     s->dsp.diff_pixels(temp, src1, src2, stride);
2146
2147     memcpy(bak, temp, 64*sizeof(int16_t));
2148
2149     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2150     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2151     ff_simple_idct_8(temp); //FIXME
2152
2153     for(i=0; i<64; i++)
2154         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2155
2156     return sum;
2157 }
2158
2159 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2160     MpegEncContext * const s= (MpegEncContext *)c;
2161     const uint8_t *scantable= s->intra_scantable.permutated;
2162     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2163     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2164     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2165     int i, last, run, bits, level, distortion, start_i;
2166     const int esc_length= s->ac_esc_length;
2167     uint8_t * length;
2168     uint8_t * last_length;
2169
2170     assert(h==8);
2171
2172     copy_block8(lsrc1, src1, 8, stride, 8);
2173     copy_block8(lsrc2, src2, 8, stride, 8);
2174
2175     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2176
2177     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2178
2179     bits=0;
2180
2181     if (s->mb_intra) {
2182         start_i = 1;
2183         length     = s->intra_ac_vlc_length;
2184         last_length= s->intra_ac_vlc_last_length;
2185         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2186     } else {
2187         start_i = 0;
2188         length     = s->inter_ac_vlc_length;
2189         last_length= s->inter_ac_vlc_last_length;
2190     }
2191
2192     if(last>=start_i){
2193         run=0;
2194         for(i=start_i; i<last; i++){
2195             int j= scantable[i];
2196             level= temp[j];
2197
2198             if(level){
2199                 level+=64;
2200                 if((level&(~127)) == 0){
2201                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2202                 }else
2203                     bits+= esc_length;
2204                 run=0;
2205             }else
2206                 run++;
2207         }
2208         i= scantable[last];
2209
2210         level= temp[i] + 64;
2211
2212         assert(level - 64);
2213
2214         if((level&(~127)) == 0){
2215             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2216         }else
2217             bits+= esc_length;
2218
2219     }
2220
2221     if(last>=0){
2222         if(s->mb_intra)
2223             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2224         else
2225             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2226     }
2227
2228     s->dsp.idct_add(lsrc2, 8, temp);
2229
2230     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2231
2232     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2233 }
2234
2235 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2236     MpegEncContext * const s= (MpegEncContext *)c;
2237     const uint8_t *scantable= s->intra_scantable.permutated;
2238     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2239     int i, last, run, bits, level, start_i;
2240     const int esc_length= s->ac_esc_length;
2241     uint8_t * length;
2242     uint8_t * last_length;
2243
2244     assert(h==8);
2245
2246     s->dsp.diff_pixels(temp, src1, src2, stride);
2247
2248     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2249
2250     bits=0;
2251
2252     if (s->mb_intra) {
2253         start_i = 1;
2254         length     = s->intra_ac_vlc_length;
2255         last_length= s->intra_ac_vlc_last_length;
2256         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2257     } else {
2258         start_i = 0;
2259         length     = s->inter_ac_vlc_length;
2260         last_length= s->inter_ac_vlc_last_length;
2261     }
2262
2263     if(last>=start_i){
2264         run=0;
2265         for(i=start_i; i<last; i++){
2266             int j= scantable[i];
2267             level= temp[j];
2268
2269             if(level){
2270                 level+=64;
2271                 if((level&(~127)) == 0){
2272                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2273                 }else
2274                     bits+= esc_length;
2275                 run=0;
2276             }else
2277                 run++;
2278         }
2279         i= scantable[last];
2280
2281         level= temp[i] + 64;
2282
2283         assert(level - 64);
2284
2285         if((level&(~127)) == 0){
2286             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2287         }else
2288             bits+= esc_length;
2289     }
2290
2291     return bits;
2292 }
2293
/*
 * Generates vsad_intra<size>_c(): the sum, over rows 1..h-1, of the absolute
 * differences between each pixel of the first <size> columns and the pixel
 * directly below it (stride bytes further). The c and dummy arguments are
 * unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        const uint8_t *below = s + stride;                                                                  \
                                                                                                            \
        for (col = 0; col < size; col++) {                                                                  \
            const int d = s[col] - below[col];                                                              \
            total += d < 0 ? -d : d;                                                                        \
        }                                                                                                   \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2311
/**
 * Sum, over rows 1..h-1 and 16 columns, of the absolute vertical change of
 * the difference s1 - s2, i.e. |(s1[x] - s2[x]) - (s1[x+stride] - s2[x+stride])|.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - below1[col] + below2[col];
            total += d < 0 ? -d : d;
        }
    }

    return total;
}
2326
/* Square of the argument; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum, over rows 1..h-1, of the squared
 * differences between each pixel of the first <size> columns and the pixel
 * directly below it (stride bytes further). The c and dummy arguments are
 * unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        const uint8_t *below = s + stride;                                                                  \
                                                                                                            \
        for (col = 0; col < size; col++) {                                                                  \
            const int d = s[col] - below[col];                                                              \
            total += d * d;                                                                                 \
        }                                                                                                   \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2345
/**
 * Sum, over rows 1..h-1 and 16 columns, of the squared vertical change of
 * the difference s1 - s2, i.e. ((s1[x] - s2[x]) - (s1[x+stride] - s2[x+stride]))^2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - below1[col] + below2[col];
            total += d * d;
        }
    }

    return total;
}
2360
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements to compare
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
2369
/*
 * Generates name16(), which builds a 16-pixel-wide score out of the 8x8
 * function name8(): the left and right halves of the top 8 rows are always
 * evaluated, and the two halves of the next 8 rows only when h is 16.
 * The calls are made one after the other, in raster order.
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int total;\
    total  = name8(s, dst,     src,     stride, 8);\
    total += name8(s, dst + 8, src + 8, stride, 8);\
    if(h!=16)\
        return total;\
    dst += 8*stride;\
    src += 8*stride;\
    total += name8(s, dst,     src,     stride, 8);\
    total += name8(s, dst + 8, src + 8, stride, 8);\
    return total;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2394
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when a with bit 31 flipped is > maxisign
 * @param maxisign threshold for the flipped comparison
 * @return mini, maxi or a, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2403
/**
 * Clip a vector of floats by comparing their bit patterns as unsigned
 * integers with clipf_c_one().
 * NOTE(review): this presumably relies on the IEEE-754 float layout and on
 * *min being negative and *max positive, which is the only condition under
 * which vector_clipf_c() calls it - confirm before using it elsewhere.
 *
 * Elements are processed in groups of 8, so a len that is not a multiple
 * of 8 is rounded up to the next multiple.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits      = *(uint32_t*)min;
    const uint32_t hi_bits      = *(uint32_t*)max;
    const uint32_t hi_flipped   = hi_bits ^ (1U<<31);
    uint32_t *out               = (uint32_t*)dst;
    const uint32_t *in          = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flipped);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When min is negative and max positive the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied element
 * by element. Elements are processed in groups of 8, so a len that is not a
 * multiple of 8 is rounded up to the next multiple.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2439
/**
 * Dot product of two int16_t vectors.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;
    int k   = 0;

    for (; order; order--, k++)
        acc += v1[k] * v2[k];

    return acc;
}
2449
/**
 * Compute the dot product of v1 and v2 and, in the same pass, add
 * mul * v3 to v1. Each product uses the value of v1[i] from before its
 * update.
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier for v3
 * @return sum of (original v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k   = 0;

    for (; order; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2459
/**
 * Multiply input by a symmetric window given by its first half:
 * window[i] is applied to both element i and element len-i-1, and each
 * product is rounded with (x + (1 << 14)) >> 15.
 * When len is odd the middle output element is not written.
 *
 * @param output destination vector
 * @param input  source vector
 * @param window first len/2 window coefficients
 * @param len    number of elements in input and output
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half  = len >> 1;
    const int round = 1 << 14;
    int front;

    for (front = 0; front < half; front++) {
        const unsigned int back = len - front - 1;
        const int16_t coef      = window[front];

        output[front] = (MUL16(input[front], coef) + round) >> 15;
        output[back]  = (MUL16(input[back],  coef) + round) >> 15;
    }
}
2472
/**
 * Pass every element of src through av_clip(value, min, max) and store the
 * result in dst.
 *
 * Elements are processed eight at a time and the first group is handled
 * before len is tested, so len has to be a non-zero multiple of 8:
 * any other value makes the unsigned counter wrap instead of reaching zero.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len != 0);
}
2488
/**
 * Run ff_j_rev_dct() on block in place, then hand the transformed block to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct() on block in place, then hand the transformed block to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2499
2500 /* init static data */
2501 av_cold void ff_dsputil_static_init(void)
2502 {
2503     int i;
2504
2505     for(i=0;i<512;i++) {
2506         ff_squareTbl[i] = (i - 256) * (i - 256);
2507     }
2508
2509     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2510 }
2511
2512 int ff_check_alignment(void){
2513     static int did_fail=0;
2514     LOCAL_ALIGNED_16(int, aligned, [4]);
2515
2516     if((intptr_t)aligned & 15){
2517         if(!did_fail){
2518 #if HAVE_MMX || HAVE_ALTIVEC
2519             av_log(NULL, AV_LOG_ERROR,
2520                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2521                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2522                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2523                 "Do not report crashes to Libav developers.\n");
2524 #endif
2525             did_fail=1;
2526         }
2527         return -1;
2528     }
2529     return 0;
2530 }
2531
/**
 * Fill a DSPContext with function pointers.
 *
 * The function first installs the C implementations (the *_c functions and
 * the generic ff_* routines), choosing the forward/inverse DCT from
 * avctx->bits_per_raw_sample, avctx->dct_algo and avctx->idct_algo. It then
 * calls the architecture-specific init routines that are enabled in the
 * build configuration and finally builds c->idct_permutation from
 * c->idct_permutation_type.
 *
 * @param c     context whose function pointers and permutation are written
 * @param avctx codec context; bits_per_raw_sample, dct_algo and idct_algo
 *              are read here, and it is forwarded to the arch init routines
 */
2532 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2533 {
    /* The return value is ignored; the call is made for the one-time
     * error message ff_check_alignment() logs on misaligned stacks. */
2534     ff_check_alignment();
2535
    /* Forward DCT: 10-bit input always gets the islow_10 variants,
     * otherwise dct_algo selects the implementation. */
2536 #if CONFIG_ENCODERS
2537     if (avctx->bits_per_raw_sample == 10) {
2538         c->fdct    = ff_jpeg_fdct_islow_10;
2539         c->fdct248 = ff_fdct248_islow_10;
2540     } else {
2541         if(avctx->dct_algo==FF_DCT_FASTINT) {
2542             c->fdct    = ff_fdct_ifast;
2543             c->fdct248 = ff_fdct_ifast248;
2544         }
2545         else if(avctx->dct_algo==FF_DCT_FAAN) {
2546             c->fdct    = ff_faandct;
2547             c->fdct248 = ff_faandct248;
2548         }
2549         else {
2550             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2551             c->fdct248 = ff_fdct248_islow_8;
2552         }
2553     }
2554 #endif //CONFIG_ENCODERS
2555
    /* Inverse DCT: every branch sets put/add/plain variants together with
     * the matching coefficient permutation type.
     * NOTE(review): only bits_per_raw_sample == 10 selects the _10 IDCT
     * here, while the get_pixels switch further down treats 9 and 10
     * alike - confirm that 9-bit input is meant to use the 8-bit IDCT. */
2556     if (avctx->bits_per_raw_sample == 10) {
2557         c->idct_put              = ff_simple_idct_put_10;
2558         c->idct_add              = ff_simple_idct_add_10;
2559         c->idct                  = ff_simple_idct_10;
2560         c->idct_permutation_type = FF_NO_IDCT_PERM;
2561     } else {
2562         if(avctx->idct_algo==FF_IDCT_INT){
2563             c->idct_put= jref_idct_put;
2564             c->idct_add= jref_idct_add;
2565             c->idct    = ff_j_rev_dct;
2566             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2567         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2568             c->idct_put= ff_faanidct_put;
2569             c->idct_add= ff_faanidct_add;
2570             c->idct    = ff_faanidct;
2571             c->idct_permutation_type= FF_NO_IDCT_PERM;
2572         }else{ //accurate/default
2573             c->idct_put = ff_simple_idct_put_8;
2574             c->idct_add = ff_simple_idct_add_8;
2575             c->idct     = ff_simple_idct_8;
2576             c->idct_permutation_type= FF_NO_IDCT_PERM;
2577         }
2578     }
2579
    /* Pixel block helpers, global motion compensation and pixel sums. */
2580     c->diff_pixels = diff_pixels_c;
2581     c->put_pixels_clamped = put_pixels_clamped_c;
2582     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2583     c->add_pixels_clamped = add_pixels_clamped_c;
2584     c->sum_abs_dctelem = sum_abs_dctelem_c;
2585     c->gmc1 = gmc1_c;
2586     c->gmc = ff_gmc_c;
2587     c->pix_sum = pix_sum_c;
2588     c->pix_norm1 = pix_norm1_c;
2589
2590     c->fill_block_tab[0] = fill_block16_c;
2591     c->fill_block_tab[1] = fill_block8_c;
2592
    /* pix_abs[size][variant]: row 0 takes the 16-wide functions, row 1 the
     * 8-wide ones; columns are plain, x2, y2 and xy2. */
2593     /* TODO [0] 16  [1] 8 */
2594     c->pix_abs[0][0] = pix_abs16_c;
2595     c->pix_abs[0][1] = pix_abs16_x2_c;
2596     c->pix_abs[0][2] = pix_abs16_y2_c;
2597     c->pix_abs[0][3] = pix_abs16_xy2_c;
2598     c->pix_abs[1][0] = pix_abs8_c;
2599     c->pix_abs[1][1] = pix_abs8_x2_c;
2600     c->pix_abs[1][2] = pix_abs8_y2_c;
2601     c->pix_abs[1][3] = pix_abs8_xy2_c;
2602
    /* tpel tables: the _mcXY_c function goes to index X + 4*Y with X and Y
     * in 0..2, so indices 3 and 7 are not assigned here. */
2603     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2604     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2605     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2606     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2607     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2608     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2609     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2610     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2611     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2612
2613     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2614     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2615     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2616     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2617     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2618     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2619     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2620     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2621     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2622
    /* dspfunc(PFX, IDX, NUM) fills all 16 entries of
     * c-><PFX>_pixels_tab[IDX] with the <PFX><NUM>_mcXY_c functions;
     * the function for offsets X, Y (0..3) lands at index X + 4*Y. */
2623 #define dspfunc(PFX, IDX, NUM) \
2624     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2625     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2626     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2627     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2628     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2629     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2630     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2631     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2632     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2633     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2634     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2635     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2636     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2637     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2638     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2639     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2640
    /* Table row 0 receives the 16-pixel functions, row 1 the 8-pixel ones.
     * The avg_no_rnd_qpel tables are left unset (calls commented out). */
2641     dspfunc(put_qpel, 0, 16);
2642     dspfunc(put_no_rnd_qpel, 0, 16);
2643
2644     dspfunc(avg_qpel, 0, 16);
2645     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2646
2647     dspfunc(put_qpel, 1, 8);
2648     dspfunc(put_no_rnd_qpel, 1, 8);
2649
2650     dspfunc(avg_qpel, 1, 8);
2651     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2652
2653 #undef dspfunc
2654
    /* mspel table: entries 0..3 take mc00..mc30 (entry 0 being the plain
     * 8x8 copy), entries 4..7 take mc02..mc32. */
2655     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2656     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2657     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2658     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2659     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2660     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2661     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2662     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2663
    /* SET_CMP_FUNC(name) stores <name>16_c in c->name[0] and <name>8x8_c
     * in c->name[1]. The macro already ends in a semicolon, which is why
     * its uses below carry none. */
2664 #define SET_CMP_FUNC(name) \
2665     c->name[0]= name ## 16_c;\
2666     c->name[1]= name ## 8x8_c;
2667
    /* Comparison functions. Where intra variants exist they are stored at
     * indices 4 and 5 of the same array. */
2668     SET_CMP_FUNC(hadamard8_diff)
2669     c->hadamard8_diff[4]= hadamard8_intra16_c;
2670     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2671     SET_CMP_FUNC(dct_sad)
2672     SET_CMP_FUNC(dct_max)
2673 #if CONFIG_GPL
2674     SET_CMP_FUNC(dct264_sad)
2675 #endif
2676     c->sad[0]= pix_abs16_c;
2677     c->sad[1]= pix_abs8_c;
2678     c->sse[0]= sse16_c;
2679     c->sse[1]= sse8_c;
2680     c->sse[2]= sse4_c;
2681     SET_CMP_FUNC(quant_psnr)
2682     SET_CMP_FUNC(rd)
2683     SET_CMP_FUNC(bit)
2684     c->vsad[0]= vsad16_c;
2685     c->vsad[4]= vsad_intra16_c;
2686     c->vsad[5]= vsad_intra8_c;
2687     c->vsse[0]= vsse16_c;
2688     c->vsse[4]= vsse_intra16_c;
2689     c->vsse[5]= vsse_intra8_c;
2690     c->nsse[0]= nsse16_c;
2691     c->nsse[1]= nsse8_c;
2692
2693     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2694
    /* Byte-wise helpers, HuffYUV prediction and byte swapping. */
2695     c->add_bytes= add_bytes_c;
2696     c->diff_bytes= diff_bytes_c;
2697     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2698     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2699     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2700     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2701     c->bswap_buf= bswap_buf;
2702     c->bswap16_buf = bswap16_buf;
2703
    /* The H.263 loop filters are only installed when an H.263 decoder or
     * encoder is part of the build. */
2704     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2705         c->h263_h_loop_filter= h263_h_loop_filter_c;
2706         c->h263_v_loop_filter= h263_v_loop_filter_c;
2707     }
2708
2709     c->try_8x8basis= try_8x8basis_c;
2710     c->add_8x8basis= add_8x8basis_c;
2711
    /* Vector and audio-style helpers. */
2712     c->vector_clipf = vector_clipf_c;
2713     c->scalarproduct_int16 = scalarproduct_int16_c;
2714     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2715     c->apply_window_int16 = apply_window_int16_c;
2716     c->vector_clip_int32 = vector_clip_int32_c;
2717
    /* shrink[0] is a plain plane copy; the remaining entries take the
     * ff_shrink22/44/88 routines. */
2718     c->shrink[0]= av_image_copy_plane;
2719     c->shrink[1]= ff_shrink22;
2720     c->shrink[2]= ff_shrink44;
2721     c->shrink[3]= ff_shrink88;
2722
2723     c->add_pixels8 = add_pixels8_c;
2724
    /* Redefine FUNC/FUNCC so that FUNCC(f, depth) expands to f_<depth>_c,
     * i.e. the name of a bit-depth specific function. */
2725 #undef FUNC
2726 #undef FUNCC
2727 #define FUNC(f, depth) f ## _ ## depth
2728 #define FUNCC(f, depth) f ## _ ## depth ## _c
2729
    /* These three always use the _8_c versions, whatever the bit depth. */
2730     c->draw_edges                    = FUNCC(draw_edges, 8);
2731     c->clear_block                   = FUNCC(clear_block, 8);
2732     c->clear_blocks                  = FUNCC(clear_blocks, 8);
2733
2734 #define BIT_DEPTH_FUNCS(depth) \
2735     c->get_pixels                    = FUNCC(get_pixels,   depth);
2736
    /* get_pixels is the only depth-dependent entry: 9- and 10-bit input
     * gets get_pixels_16_c, everything else get_pixels_8_c. */
2737     switch (avctx->bits_per_raw_sample) {
2738     case 9:
2739     case 10:
2740         BIT_DEPTH_FUNCS(16);
2741         break;
2742     default:
2743         BIT_DEPTH_FUNCS(8);
2744         break;
2745     }
2746
2747
    /* Architecture-specific initialisation runs after all C defaults are
     * in place - presumably so each routine can replace entries with
     * optimized versions; verify against the individual init functions. */
2748     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2749     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2750     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2751     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2752     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2753     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2754     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2755
    /* Built last, from whatever idct_permutation_type is in effect once
     * the arch init routines have run. */
2756     ff_init_scantable_permutation(c->idct_permutation,
2757                                   c->idct_permutation_type);
2758 }