git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "libavutil/internal.h"
  32 #include "avcodec.h"
  33 #include "copy_block.h"
  34 #include "dsputil.h"
  35 #include "simple_idct.h"
  36 #include "faandct.h"
  37 #include "faanidct.h"
  38 #include "imgconvert.h"
  39 #include "mathops.h"
  40 #include "mpegvideo.h"
  41 #include "config.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 9
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 10
  51 #include "dsputil_template.c"
  52 #undef BIT_DEPTH
  53
  54 #define BIT_DEPTH 8
  55 #include "dsputil_template.c"
  56
  57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  58 #define pb_7f (~0UL/255 * 0x7f)
  59 #define pb_80 (~0UL/255 * 0x80)
  60
  61 const uint8_t ff_zigzag_direct[64] = {
  62     0,   1,  8, 16,  9,  2,  3, 10,
  63     17, 24, 32, 25, 18, 11,  4,  5,
  64     12, 19, 26, 33, 40, 48, 41, 34,
  65     27, 20, 13,  6,  7, 14, 21, 28,
  66     35, 42, 49, 56, 57, 50, 43, 36,
  67     29, 22, 15, 23, 30, 37, 44, 51,
  68     58, 59, 52, 45, 38, 31, 39, 46,
  69     53, 60, 61, 54, 47, 55, 62, 63
  70 };
  71
  72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  73    specification, we interleave the fields */
  74 const uint8_t ff_zigzag248_direct[64] = {
  75      0,  8,  1,  9, 16, 24,  2, 10,
  76     17, 25, 32, 40, 48, 56, 33, 41,
  77     18, 26,  3, 11,  4, 12, 19, 27,
  78     34, 42, 49, 57, 50, 58, 35, 43,
  79     20, 28,  5, 13,  6, 14, 21, 29,
  80     36, 44, 51, 59, 52, 60, 37, 45,
  81     22, 30,  7, 15, 23, 31, 38, 46,
  82     53, 61, 54, 62, 39, 47, 55, 63,
  83 };
  84
   85 /* Inverse of zigzag_direct with 1 added, without any IDCT permutation applied; for the MMX quantizer. Declared with 16-byte alignment, not filled in here. */
   86 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  87
  88 const uint8_t ff_alternate_horizontal_scan[64] = {
  89     0,  1,   2,  3,  8,  9, 16, 17,
  90     10, 11,  4,  5,  6,  7, 15, 14,
  91     13, 12, 19, 18, 24, 25, 32, 33,
  92     26, 27, 20, 21, 22, 23, 28, 29,
  93     30, 31, 34, 35, 40, 41, 48, 49,
  94     42, 43, 36, 37, 38, 39, 44, 45,
  95     46, 47, 50, 51, 56, 57, 58, 59,
  96     52, 53, 54, 55, 60, 61, 62, 63,
  97 };
  98
  99 const uint8_t ff_alternate_vertical_scan[64] = {
 100     0,  8,  16, 24,  1,  9,  2, 10,
 101     17, 25, 32, 40, 48, 56, 57, 49,
 102     41, 33, 26, 18,  3, 11,  4, 12,
 103     19, 27, 34, 42, 50, 58, 35, 43,
 104     51, 59, 20, 28,  5, 13,  6, 14,
 105     21, 29, 36, 44, 52, 60, 37, 45,
 106     53, 61, 22, 30,  7, 15, 23, 31,
 107     38, 46, 54, 62, 39, 47, 55, 63,
 108 };
 109
 110 /* Input permutation for the simple_idct_mmx */
 111 static const uint8_t simple_mmx_permutation[64]={
 112         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 113         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 114         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 115         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 116         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 117         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 118         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 119         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 120 };
 121
 122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134     }
 135
 136     end=-1;
 137     for(i=0; i<64; i++){
 138         int j;
 139         j = st->permutated[i];
 140         if(j>end) end=j;
 141         st->raster_end[i]= end;
 142     }
 143 }
 144
 145 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 146                                    int idct_permutation_type)
 147 {
 148     int i;
 149
 150     switch(idct_permutation_type){
 151     case FF_NO_IDCT_PERM:
 152         for(i=0; i<64; i++)
 153             idct_permutation[i]= i;
 154         break;
 155     case FF_LIBMPEG2_IDCT_PERM:
 156         for(i=0; i<64; i++)
 157             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 158         break;
 159     case FF_SIMPLE_IDCT_PERM:
 160         for(i=0; i<64; i++)
 161             idct_permutation[i]= simple_mmx_permutation[i];
 162         break;
 163     case FF_TRANSPOSE_IDCT_PERM:
 164         for(i=0; i<64; i++)
 165             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 166         break;
 167     case FF_PARTTRANS_IDCT_PERM:
 168         for(i=0; i<64; i++)
 169             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 170         break;
 171     case FF_SSE2_IDCT_PERM:
 172         for(i=0; i<64; i++)
 173             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 174         break;
 175     default:
 176         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 177     }
 178 }
 179
 180 static int pix_sum_c(uint8_t * pix, int line_size)
 181 {
 182     int s, i, j;
 183
 184     s = 0;
 185     for (i = 0; i < 16; i++) {
 186         for (j = 0; j < 16; j += 8) {
 187             s += pix[0];
 188             s += pix[1];
 189             s += pix[2];
 190             s += pix[3];
 191             s += pix[4];
 192             s += pix[5];
 193             s += pix[6];
 194             s += pix[7];
 195             pix += 8;
 196         }
 197         pix += line_size - 16;
 198     }
 199     return s;
 200 }
 201
 202 static int pix_norm1_c(uint8_t * pix, int line_size)
 203 {
 204     int s, i, j;
 205     uint32_t *sq = ff_squareTbl + 256;
 206
 207     s = 0;
 208     for (i = 0; i < 16; i++) {
 209         for (j = 0; j < 16; j += 8) {
 210 #if 0
 211             s += sq[pix[0]];
 212             s += sq[pix[1]];
 213             s += sq[pix[2]];
 214             s += sq[pix[3]];
 215             s += sq[pix[4]];
 216             s += sq[pix[5]];
 217             s += sq[pix[6]];
 218             s += sq[pix[7]];
 219 #else
 220 #if HAVE_FAST_64BIT
 221             register uint64_t x=*(uint64_t*)pix;
 222             s += sq[x&0xff];
 223             s += sq[(x>>8)&0xff];
 224             s += sq[(x>>16)&0xff];
 225             s += sq[(x>>24)&0xff];
 226             s += sq[(x>>32)&0xff];
 227             s += sq[(x>>40)&0xff];
 228             s += sq[(x>>48)&0xff];
 229             s += sq[(x>>56)&0xff];
 230 #else
 231             register uint32_t x=*(uint32_t*)pix;
 232             s += sq[x&0xff];
 233             s += sq[(x>>8)&0xff];
 234             s += sq[(x>>16)&0xff];
 235             s += sq[(x>>24)&0xff];
 236             x=*(uint32_t*)(pix+4);
 237             s += sq[x&0xff];
 238             s += sq[(x>>8)&0xff];
 239             s += sq[(x>>16)&0xff];
 240             s += sq[(x>>24)&0xff];
 241 #endif
 242 #endif
 243             pix += 8;
 244         }
 245         pix += line_size - 16;
 246     }
 247     return s;
 248 }
 249
 250 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 251     int i;
 252
 253     for(i=0; i+8<=w; i+=8){
 254         dst[i+0]= av_bswap32(src[i+0]);
 255         dst[i+1]= av_bswap32(src[i+1]);
 256         dst[i+2]= av_bswap32(src[i+2]);
 257         dst[i+3]= av_bswap32(src[i+3]);
 258         dst[i+4]= av_bswap32(src[i+4]);
 259         dst[i+5]= av_bswap32(src[i+5]);
 260         dst[i+6]= av_bswap32(src[i+6]);
 261         dst[i+7]= av_bswap32(src[i+7]);
 262     }
 263     for(;i<w; i++){
 264         dst[i+0]= av_bswap32(src[i+0]);
 265     }
 266 }
 267
 268 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 269 {
 270     while (len--)
 271         *dst++ = av_bswap16(*src++);
 272 }
 273
 274 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 275 {
 276     int s, i;
 277     uint32_t *sq = ff_squareTbl + 256;
 278
 279     s = 0;
 280     for (i = 0; i < h; i++) {
 281         s += sq[pix1[0] - pix2[0]];
 282         s += sq[pix1[1] - pix2[1]];
 283         s += sq[pix1[2] - pix2[2]];
 284         s += sq[pix1[3] - pix2[3]];
 285         pix1 += line_size;
 286         pix2 += line_size;
 287     }
 288     return s;
 289 }
 290
 291 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 292 {
 293     int s, i;
 294     uint32_t *sq = ff_squareTbl + 256;
 295
 296     s = 0;
 297     for (i = 0; i < h; i++) {
 298         s += sq[pix1[0] - pix2[0]];
 299         s += sq[pix1[1] - pix2[1]];
 300         s += sq[pix1[2] - pix2[2]];
 301         s += sq[pix1[3] - pix2[3]];
 302         s += sq[pix1[4] - pix2[4]];
 303         s += sq[pix1[5] - pix2[5]];
 304         s += sq[pix1[6] - pix2[6]];
 305         s += sq[pix1[7] - pix2[7]];
 306         pix1 += line_size;
 307         pix2 += line_size;
 308     }
 309     return s;
 310 }
 311
 312 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 313 {
 314     int s, i;
 315     uint32_t *sq = ff_squareTbl + 256;
 316
 317     s = 0;
 318     for (i = 0; i < h; i++) {
 319         s += sq[pix1[ 0] - pix2[ 0]];
 320         s += sq[pix1[ 1] - pix2[ 1]];
 321         s += sq[pix1[ 2] - pix2[ 2]];
 322         s += sq[pix1[ 3] - pix2[ 3]];
 323         s += sq[pix1[ 4] - pix2[ 4]];
 324         s += sq[pix1[ 5] - pix2[ 5]];
 325         s += sq[pix1[ 6] - pix2[ 6]];
 326         s += sq[pix1[ 7] - pix2[ 7]];
 327         s += sq[pix1[ 8] - pix2[ 8]];
 328         s += sq[pix1[ 9] - pix2[ 9]];
 329         s += sq[pix1[10] - pix2[10]];
 330         s += sq[pix1[11] - pix2[11]];
 331         s += sq[pix1[12] - pix2[12]];
 332         s += sq[pix1[13] - pix2[13]];
 333         s += sq[pix1[14] - pix2[14]];
 334         s += sq[pix1[15] - pix2[15]];
 335
 336         pix1 += line_size;
 337         pix2 += line_size;
 338     }
 339     return s;
 340 }
 341
 342 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 343                           const uint8_t *s2, int stride){
 344     int i;
 345
 346     /* read the pixels */
 347     for(i=0;i<8;i++) {
 348         block[0] = s1[0] - s2[0];
 349         block[1] = s1[1] - s2[1];
 350         block[2] = s1[2] - s2[2];
 351         block[3] = s1[3] - s2[3];
 352         block[4] = s1[4] - s2[4];
 353         block[5] = s1[5] - s2[5];
 354         block[6] = s1[6] - s2[6];
 355         block[7] = s1[7] - s2[7];
 356         s1 += stride;
 357         s2 += stride;
 358         block += 8;
 359     }
 360 }
 361
 362
 363 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 364                                  int line_size)
 365 {
 366     int i;
 367
 368     /* read the pixels */
 369     for(i=0;i<8;i++) {
 370         pixels[0] = av_clip_uint8(block[0]);
 371         pixels[1] = av_clip_uint8(block[1]);
 372         pixels[2] = av_clip_uint8(block[2]);
 373         pixels[3] = av_clip_uint8(block[3]);
 374         pixels[4] = av_clip_uint8(block[4]);
 375         pixels[5] = av_clip_uint8(block[5]);
 376         pixels[6] = av_clip_uint8(block[6]);
 377         pixels[7] = av_clip_uint8(block[7]);
 378
 379         pixels += line_size;
 380         block += 8;
 381     }
 382 }
 383
 384 static void put_signed_pixels_clamped_c(const int16_t *block,
 385                                         uint8_t *restrict pixels,
 386                                         int line_size)
 387 {
 388     int i, j;
 389
 390     for (i = 0; i < 8; i++) {
 391         for (j = 0; j < 8; j++) {
 392             if (*block < -128)
 393                 *pixels = 0;
 394             else if (*block > 127)
 395                 *pixels = 255;
 396             else
 397                 *pixels = (uint8_t)(*block + 128);
 398             block++;
 399             pixels++;
 400         }
 401         pixels += (line_size - 8);
 402     }
 403 }
 404
 405 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 406                                  int line_size)
 407 {
 408     int i;
 409
 410     /* read the pixels */
 411     for(i=0;i<8;i++) {
 412         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 413         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 414         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 415         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 416         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 417         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 418         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 419         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 420         pixels += line_size;
 421         block += 8;
 422     }
 423 }
 424
 425 static int sum_abs_dctelem_c(int16_t *block)
 426 {
 427     int sum=0, i;
 428     for(i=0; i<64; i++)
 429         sum+= FFABS(block[i]);
 430     return sum;
 431 }
 432
 433 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 434 {
 435     int i;
 436
 437     for (i = 0; i < h; i++) {
 438         memset(block, value, 16);
 439         block += line_size;
 440     }
 441 }
 442
 443 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 444 {
 445     int i;
 446
 447     for (i = 0; i < h; i++) {
 448         memset(block, value, 8);
 449         block += line_size;
 450     }
 451 }
 452
 453 #define avg2(a,b) ((a+b+1)>>1)
 454 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 455
 456 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 457 {
 458     const int A=(16-x16)*(16-y16);
 459     const int B=(   x16)*(16-y16);
 460     const int C=(16-x16)*(   y16);
 461     const int D=(   x16)*(   y16);
 462     int i;
 463
 464     for(i=0; i<h; i++)
 465     {
 466         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 467         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 468         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 469         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 470         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 471         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 472         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 473         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 474         dst+= stride;
 475         src+= stride;
 476     }
 477 }
 478
 479 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 480                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 481 {
 482     int y, vx, vy;
 483     const int s= 1<<shift;
 484
 485     width--;
 486     height--;
 487
 488     for(y=0; y<h; y++){
 489         int x;
 490
 491         vx= ox;
 492         vy= oy;
 493         for(x=0; x<8; x++){ //XXX FIXME optimize
 494             int src_x, src_y, frac_x, frac_y, index;
 495
 496             src_x= vx>>16;
 497             src_y= vy>>16;
 498             frac_x= src_x&(s-1);
 499             frac_y= src_y&(s-1);
 500             src_x>>=shift;
 501             src_y>>=shift;
 502
 503             if((unsigned)src_x < width){
 504                 if((unsigned)src_y < height){
 505                     index= src_x + src_y*stride;
 506                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 507                                            + src[index       +1]*   frac_x )*(s-frac_y)
 508                                         + (  src[index+stride  ]*(s-frac_x)
 509                                            + src[index+stride+1]*   frac_x )*   frac_y
 510                                         + r)>>(shift*2);
 511                 }else{
 512                     index= src_x + av_clip(src_y, 0, height)*stride;
 513                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 514                                           + src[index       +1]*   frac_x )*s
 515                                         + r)>>(shift*2);
 516                 }
 517             }else{
 518                 if((unsigned)src_y < height){
 519                     index= av_clip(src_x, 0, width) + src_y*stride;
 520                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 521                                            + src[index+stride  ]*   frac_y )*s
 522                                         + r)>>(shift*2);
 523                 }else{
 524                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 525                     dst[y*stride + x]=    src[index         ];
 526                 }
 527             }
 528
 529             vx+= dxx;
 530             vy+= dyx;
 531         }
 532         ox += dxy;
 533         oy += dyy;
 534     }
 535 }
 536
 537 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 538     switch(width){
 539     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 540     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 541     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 542     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 543     }
 544 }
 545
 546 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 547     int i,j;
 548     for (i=0; i < height; i++) {
 549       for (j=0; j < width; j++) {
 550         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 551       }
 552       src += stride;
 553       dst += stride;
 554     }
 555 }
 556
 557 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 558     int i,j;
 559     for (i=0; i < height; i++) {
 560       for (j=0; j < width; j++) {
 561         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 562       }
 563       src += stride;
 564       dst += stride;
 565     }
 566 }
 567
 568 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 569     int i,j;
 570     for (i=0; i < height; i++) {
 571       for (j=0; j < width; j++) {
 572         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 573       }
 574       src += stride;
 575       dst += stride;
 576     }
 577 }
 578
 579 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 580     int i,j;
 581     for (i=0; i < height; i++) {
 582       for (j=0; j < width; j++) {
 583         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 584       }
 585       src += stride;
 586       dst += stride;
 587     }
 588 }
 589
 590 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 591     int i,j;
 592     for (i=0; i < height; i++) {
 593       for (j=0; j < width; j++) {
 594         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 595       }
 596       src += stride;
 597       dst += stride;
 598     }
 599 }
 600
 601 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 602     int i,j;
 603     for (i=0; i < height; i++) {
 604       for (j=0; j < width; j++) {
 605         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 606       }
 607       src += stride;
 608       dst += stride;
 609     }
 610 }
 611
 612 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 613     int i,j;
 614     for (i=0; i < height; i++) {
 615       for (j=0; j < width; j++) {
 616         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 617       }
 618       src += stride;
 619       dst += stride;
 620     }
 621 }
 622
 623 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 624     int i,j;
 625     for (i=0; i < height; i++) {
 626       for (j=0; j < width; j++) {
 627         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 628       }
 629       src += stride;
 630       dst += stride;
 631     }
 632 }
 633
 634 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 635     switch(width){
 636     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 637     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 638     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 639     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 640     }
 641 }
 642
 643 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 644     int i,j;
 645     for (i=0; i < height; i++) {
 646       for (j=0; j < width; j++) {
 647         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 648       }
 649       src += stride;
 650       dst += stride;
 651     }
 652 }
 653
 654 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 655     int i,j;
 656     for (i=0; i < height; i++) {
 657       for (j=0; j < width; j++) {
 658         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 659       }
 660       src += stride;
 661       dst += stride;
 662     }
 663 }
 664
 665 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 666     int i,j;
 667     for (i=0; i < height; i++) {
 668       for (j=0; j < width; j++) {
 669         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 670       }
 671       src += stride;
 672       dst += stride;
 673     }
 674 }
 675
 676 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 677     int i,j;
 678     for (i=0; i < height; i++) {
 679       for (j=0; j < width; j++) {
 680         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 681       }
 682       src += stride;
 683       dst += stride;
 684     }
 685 }
 686
 687 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 688     int i,j;
 689     for (i=0; i < height; i++) {
 690       for (j=0; j < width; j++) {
 691         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 692       }
 693       src += stride;
 694       dst += stride;
 695     }
 696 }
 697
 698 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 699     int i,j;
 700     for (i=0; i < height; i++) {
 701       for (j=0; j < width; j++) {
 702         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 703       }
 704       src += stride;
 705       dst += stride;
 706     }
 707 }
 708
 709 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 710     int i,j;
 711     for (i=0; i < height; i++) {
 712       for (j=0; j < width; j++) {
 713         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 714       }
 715       src += stride;
 716       dst += stride;
 717     }
 718 }
 719
 720 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 721     int i,j;
 722     for (i=0; i < height; i++) {
 723       for (j=0; j < width; j++) {
 724         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 725       }
 726       src += stride;
 727       dst += stride;
 728     }
 729 }
 730
 731 #define QPEL_MC(r, OPNAME, RND, OP) \
 732 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 733     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 734     int i;\
 735     for(i=0; i<h; i++)\
 736     {\
 737         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 738         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 739         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 740         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 741         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 742         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 743         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 744         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 745         dst+=dstStride;\
 746         src+=srcStride;\
 747     }\
 748 }\
 749 \
 750 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 751     const int w=8;\
 752     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 753     int i;\
 754     for(i=0; i<w; i++)\
 755     {\
 756         const int src0= src[0*srcStride];\
 757         const int src1= src[1*srcStride];\
 758         const int src2= src[2*srcStride];\
 759         const int src3= src[3*srcStride];\
 760         const int src4= src[4*srcStride];\
 761         const int src5= src[5*srcStride];\
 762         const int src6= src[6*srcStride];\
 763         const int src7= src[7*srcStride];\
 764         const int src8= src[8*srcStride];\
 765         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 766         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 767         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 768         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 769         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 770         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 771         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 772         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 773         dst++;\
 774         src++;\
 775     }\
 776 }\
 777 \
 778 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 779     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 780     int i;\
 781     \
 782     for(i=0; i<h; i++)\
 783     {\
 784         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 785         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 786         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 787         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 788         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 789         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 790         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 791         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 792         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 793         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 794         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 795         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 796         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 797         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 798         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 799         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 800         dst+=dstStride;\
 801         src+=srcStride;\
 802     }\
 803 }\
 804 \
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 8-tap lowpass: for each of 16 columns reads 17 source rows and emits 16 rows through OP() */\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* used by name inside the op_* macros this macro is instantiated with */\
    int i;\
    const int w=16; /* number of columns processed */\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride]; /* the whole column is loaded before any output is stored */\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 )); /* rows 0..2: taps that would lie above row 0 are mirrored back inside */\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 )); /* rows 3..12: regular symmetric taps (20, -6, 3, -1), sum 32 */\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16)); /* rows 13..15: taps that would lie below row 16 are mirrored back inside */\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++; /* step to the next column */\
        src++;\
    }\
}\
 848 \
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontally filtered plane combined with the unshifted source */\
    uint8_t half[64]; /* 8 rows at pitch 8 */\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); /* scratch is always produced with the put variant */\
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
 854 \
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontal lowpass stored straight into dst with this instantiation's operator */\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); /* 8 rows, same pitch for source and destination */\
}\
 858 \
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontally filtered plane combined with the source shifted one pixel right */\
    uint8_t half[64]; /* 8 rows at pitch 8 */\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8); /* src+1 is the only difference from the mc10 variant; l2 helper is defined outside this view */\
}\
 864 \
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: vertically filtered plane combined with the unshifted source copy */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t half[64]; /* 8 rows at pitch 8 */\
    copy_block9(full, src, 16, stride, 9); /* presumably copies 9 source rows into full; helper not shown here */\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
 872 \
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: vertical lowpass of a local source copy stored straight into dst */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    copy_block9(full, src, 16, stride, 9); /* presumably copies 9 source rows into full; helper not shown here */\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
 878 \
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: vertically filtered plane combined with the source copy shifted one row down */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t half[64]; /* 8 rows at pitch 8 */\
    copy_block9(full, src, 16, stride, 9); /* presumably copies 9 source rows into full; helper not shown here */\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8); /* full+16 skips one row of the pitch-16 copy; l2 helper is defined outside this view */\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: builds H, V and HV planes and combines them with the source copy */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); /* vertical pass over the horizontal result */\
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* l4 helper is defined outside this view; presumably averages its four sources */\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: blends the source into halfH first, then filters vertically and blends once more */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); /* in place: halfH is destination and first source, over all 9 rows */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: as mc11_old but the source-side inputs start one pixel to the right */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* vertical filter of the column one to the right */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* l4 helper is defined outside this view; presumably averages its four sources */\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as mc11 but halfH is pre-blended with the source shifted one pixel right */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9); /* in place on halfH; second source is full+1 */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: as mc11_old but source copy and halfH are taken one row down */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* +16 / +8 each skip one row of their buffer; l4 helper is defined outside this view */\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as mc11 but the final blend reads halfH starting one row down */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); /* in place: halfH is destination and first source */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8 skips the first row; l2 helper is defined outside this view */\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: source-side inputs shifted one pixel right and one row down */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* vertical filter of the column one to the right */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* full+17 = one row (16) plus one pixel; l4 helper is defined outside this view */\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: pre-blend uses the source one pixel right, final blend reads halfH one row down */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9); /* in place on halfH; second source is full+1 */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8 skips the first row; l2 helper is defined outside this view */\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontal plane combined with the horizontal-then-vertical plane */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* filters src directly, 9 rows, no local copy */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as mc21 but the horizontal plane is read starting one row down */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* filters src directly, 9 rows, no local copy */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8 skips the first row; l2 helper is defined outside this view */\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: vertical plane combined with the horizontal-then-vertical plane */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontal plane blended with the source, then filtered vertically into dst */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); /* in place on halfH; l2 helper is defined outside this view */\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); /* final store uses this instantiation's operator */\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: as mc12_old but the vertical plane comes from one pixel to the right */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* vertical filter of the column one to the right */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as mc12 but halfH is blended with the source shifted one pixel right */\
    uint8_t full[16*9]; /* 9 rows at pitch 16 */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9); /* in place on halfH; second source is full+1 */\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); /* final store uses this instantiation's operator */\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontal lowpass followed by vertical lowpass, no blending */\
    uint8_t halfH[72]; /* 9 rows at pitch 8 */\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* intermediate always uses the put variant */\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); /* final store uses this instantiation's operator */\
}\
1027 \
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontally filtered plane combined with the unshifted source */\
    uint8_t half[256]; /* 16 rows at pitch 16 */\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); /* scratch is always produced with the put variant */\
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
1033 \
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontal lowpass stored straight into dst with this instantiation's operator */\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); /* 16 rows, same pitch for source and destination */\
}\
1037 \
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontally filtered plane combined with the source shifted one pixel right */\
    uint8_t half[256]; /* 16 rows at pitch 16 */\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16); /* src+1 is the only difference from the mc10 variant; l2 helper is defined outside this view */\
}\
1043 \
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: vertically filtered plane combined with the unshifted source copy */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t half[256]; /* 16 rows at pitch 16 */\
    copy_block17(full, src, 24, stride, 17); /* presumably copies 17 source rows into full; helper not shown here */\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
1051 \
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: vertical lowpass of a local source copy stored straight into dst */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    copy_block17(full, src, 24, stride, 17); /* presumably copies 17 source rows into full; helper not shown here */\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
1057 \
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: vertically filtered plane combined with the source copy shifted one row down */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t half[256]; /* 16 rows at pitch 16 */\
    copy_block17(full, src, 24, stride, 17); /* presumably copies 17 source rows into full; helper not shown here */\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16); /* full+24 skips one row of the pitch-24 copy; l2 helper is defined outside this view */\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: builds H, V and HV planes and combines them with the source copy */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); /* vertical pass over the horizontal result */\
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* l4 helper is defined outside this view; presumably averages its four sources */\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: blends the source into halfH first, then filters vertically and blends once more */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); /* in place: halfH is destination and first source, over all 17 rows */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: as mc11_old but the source-side inputs start one pixel to the right */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* vertical filter of the column one to the right */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* l4 helper is defined outside this view; presumably averages its four sources */\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as mc11 but halfH is pre-blended with the source shifted one pixel right */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17); /* in place on halfH; second source is full+1 */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: as mc11_old but source copy and halfH are taken one row down */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* +24 / +16 each skip one row of their buffer; l4 helper is defined outside this view */\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as mc11 but the final blend reads halfH starting one row down */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); /* in place: halfH is destination and first source */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips the first row; l2 helper is defined outside this view */\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: source-side inputs shifted one pixel right and one row down */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* vertical filter of the column one to the right */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* full+25 = one row (24) plus one pixel; l4 helper is defined outside this view */\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: pre-blend uses the source one pixel right, final blend reads halfH one row down */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17); /* in place on halfH; second source is full+1 */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips the first row; l2 helper is defined outside this view */\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontal plane combined with the horizontal-then-vertical plane */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* filters src directly, 17 rows, no local copy */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as mc21 but the horizontal plane is read starting one row down */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* filters src directly, 17 rows, no local copy */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips the first row; l2 helper is defined outside this view */\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: vertical plane combined with the horizontal-then-vertical plane */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontal plane blended with the source, then filtered vertically into dst */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); /* in place on halfH; l2 helper is defined outside this view */\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* final store uses this instantiation's operator */\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: as mc12_old but the vertical plane comes from one pixel to the right */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* vertical filter of the column one to the right */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); /* l2 helper is defined outside this view; presumably averages its two sources */\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as mc12 but halfH is blended with the source shifted one pixel right */\
    uint8_t full[24*17]; /* 17 rows at pitch 24 */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17); /* in place on halfH; second source is full+1 */\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* final store uses this instantiation's operator */\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontal lowpass followed by vertical lowpass, no blending */\
    uint8_t halfH[272]; /* 17 rows at pitch 16 */\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* intermediate always uses the put variant */\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* final store uses this instantiation's operator */\
}
1206
/*
 * Store operators handed to QPEL_MC as its last argument.
 *   a - destination lvalue, b - raw (unscaled) lowpass filter sum.
 * The lowpass taps used by the qpel filters above are 20, -6, 3, -1, each
 * applied to a pair of samples, so the sum carries a gain of 32: hence >>5.
 * +16 rounds to nearest; +15 is the no-rounding flavour (ties go down).
 * `cm` is not a macro parameter: the expansion site must have a local
 * named cm (the lowpass functions define it as ff_cropTbl + MAX_NEG_CROP).
 * The avg forms additionally average the new value with what is already in a.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the qpel function families: put, put_no_rnd and avg. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
/* The avg_no_rnd family is disabled; op_avg_no_rnd is therefore not used by
 * any instantiation visible here. */
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meaningful inside the expansions above. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1220
/*
 * QPEL_MC generates no mc00 functions in the part of the macro visible here;
 * the mc00 names are aliased to existing whole-block functions instead.
 * Both put_no_rnd aliases map to put functions rather than a no_rnd variant.
 * NOTE(review): the last alias targets a name with an extra "_8" suffix
 * (ff_put_pixels16x16_8_c) unlike the other five - presumably the
 * bit-depth-suffixed name from dsputil_template.c; confirm it is intended.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1227
1228 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1229     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1230     int i;
1231
1232     for(i=0; i<h; i++){
1233         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1234         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1235         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1236         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1237         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1238         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1239         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1240         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1241         dst+=dstStride;
1242         src+=srcStride;
1243     }
1244 }
1245
1246 #if CONFIG_RV40_DECODER
/* RV40 16x16 "put" for the mc33 position: forwards to put_pixels16_xy2_8_c
 * with a height of 16 (helper defined outside this chunk; presumably the
 * 2x2 neighbourhood average). */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/* RV40 16x16 "avg" for the mc33 position: forwards to avg_pixels16_xy2_8_c
 * with a height of 16 (helper defined outside this chunk). */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/* RV40 8x8 "put" for the mc33 position: forwards to put_pixels8_xy2_8_c
 * with a height of 8 (helper defined outside this chunk). */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/* RV40 8x8 "avg" for the mc33 position: forwards to avg_pixels8_xy2_8_c
 * with a height of 8 (helper defined outside this chunk). */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
1259 #endif /* CONFIG_RV40_DECODER */
1260
1261 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1262     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1263     int i;
1264
1265     for(i=0; i<w; i++){
1266         const int src_1= src[ -srcStride];
1267         const int src0 = src[0          ];
1268         const int src1 = src[  srcStride];
1269         const int src2 = src[2*srcStride];
1270         const int src3 = src[3*srcStride];
1271         const int src4 = src[4*srcStride];
1272         const int src5 = src[5*srcStride];
1273         const int src6 = src[6*srcStride];
1274         const int src7 = src[7*srcStride];
1275         const int src8 = src[8*srcStride];
1276         const int src9 = src[9*srcStride];
1277         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1278         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1279         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1280         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1281         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1282         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1283         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1284         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1285         src++;
1286         dst++;
1287     }
1288 }
1289
/**
 * mspel 8x8, mc10: the horizontally filtered block combined with the
 * unshifted source through put_pixels8_l2_8 (defined outside this chunk;
 * presumably a two-source average).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8]; /* 8 rows, pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1295
/* mspel 8x8, mc20: the horizontal lowpass is written directly into dst
 * (8 rows, source and destination share the same stride). */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1299
/**
 * mspel 8x8, mc30: the horizontally filtered block combined with the source
 * shifted one pixel to the right, through put_pixels8_l2_8 (defined outside
 * this chunk; presumably a two-source average).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8]; /* 8 rows, pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1305
/* mspel 8x8, mc02: the vertical lowpass is written directly into dst
 * (8 columns, source and destination share the same stride). */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1309
/**
 * mspel 8x8, mc12: combines the vertically filtered source with the
 * horizontally-then-vertically filtered source.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads one row above and two rows below its 8 output rows.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hor[8 * 11];    /* rows -1 .. 9 of the horizontal pass, pitch 8 */
    uint8_t ver[8 * 8];
    uint8_t hor_ver[8 * 8];

    wmv2_mspel8_h_lowpass(hor, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(ver, src, 8, stride, 8);
    /* hor + 8 is row 0 of the horizontal pass; row -1 sits just before it. */
    wmv2_mspel8_v_lowpass(hor_ver, hor + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, ver, hor_ver, stride, 8, 8, 8);
}
/**
 * mspel 8x8, mc32: like the mc12 case, except that the purely vertical pass
 * is taken from the source shifted one pixel to the right.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads one row above and two rows below its 8 output rows.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hor[8 * 11];    /* rows -1 .. 9 of the horizontal pass, pitch 8 */
    uint8_t ver[8 * 8];
    uint8_t hor_ver[8 * 8];

    wmv2_mspel8_h_lowpass(hor, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(ver, src + 1, 8, stride, 8);
    /* hor + 8 is row 0 of the horizontal pass; row -1 sits just before it. */
    wmv2_mspel8_v_lowpass(hor_ver, hor + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, ver, hor_ver, stride, 8, 8, 8);
}
/**
 * mspel 8x8, mc22: horizontal lowpass over 11 rows (starting one row above
 * src) followed by the vertical lowpass, written straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hor[8 * 11]; /* rows -1 .. 9 of the horizontal pass, pitch 8 */

    wmv2_mspel8_h_lowpass(hor, src - stride, 8, stride, 11);
    /* hor + 8 is row 0 of the horizontal pass; row -1 sits just before it. */
    wmv2_mspel8_v_lowpass(dst, hor + 8, stride, 8, 8);
}
1333
1334 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1335     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1336     int x;
1337     const int strength= ff_h263_loop_filter_strength[qscale];
1338
1339     for(x=0; x<8; x++){
1340         int d1, d2, ad1;
1341         int p0= src[x-2*stride];
1342         int p1= src[x-1*stride];
1343         int p2= src[x+0*stride];
1344         int p3= src[x+1*stride];
1345         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1346
1347         if     (d<-2*strength) d1= 0;
1348         else if(d<-  strength) d1=-2*strength - d;
1349         else if(d<   strength) d1= d;
1350         else if(d< 2*strength) d1= 2*strength - d;
1351         else                   d1= 0;
1352
1353         p1 += d1;
1354         p2 -= d1;
1355         if(p1&256) p1= ~(p1>>31);
1356         if(p2&256) p2= ~(p2>>31);
1357
1358         src[x-1*stride] = p1;
1359         src[x+0*stride] = p2;
1360
1361         ad1= FFABS(d1)>>1;
1362
1363         d2= av_clip((p0-p3)/4, -ad1, ad1);
1364
1365         src[x-2*stride] = p0 - d2;
1366         src[x+  stride] = p3 + d2;
1367     }
1368     }
1369 }
1370
1371 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1372     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1373     int y;
1374     const int strength= ff_h263_loop_filter_strength[qscale];
1375
1376     for(y=0; y<8; y++){
1377         int d1, d2, ad1;
1378         int p0= src[y*stride-2];
1379         int p1= src[y*stride-1];
1380         int p2= src[y*stride+0];
1381         int p3= src[y*stride+1];
1382         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1383
1384         if     (d<-2*strength) d1= 0;
1385         else if(d<-  strength) d1=-2*strength - d;
1386         else if(d<   strength) d1= d;
1387         else if(d< 2*strength) d1= 2*strength - d;
1388         else                   d1= 0;
1389
1390         p1 += d1;
1391         p2 -= d1;
1392         if(p1&256) p1= ~(p1>>31);
1393         if(p2&256) p2= ~(p2>>31);
1394
1395         src[y*stride-1] = p1;
1396         src[y*stride+0] = p2;
1397
1398         ad1= FFABS(d1)>>1;
1399
1400         d2= av_clip((p0-p3)/4, -ad1, ad1);
1401
1402         src[y*stride-2] = p0 - d2;
1403         src[y*stride+1] = p3 + d2;
1404     }
1405     }
1406 }
1407
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch array; the top and bottom rows
 * are not filtered and are stored scaled by 4 so that every scratch entry
 * carries the same gain. Pass 2 filters horizontally and writes back: the
 * left and right columns only undo the vertical gain (/4), the interior
 * columns undo the combined gain (/16). Both divisions round to nearest.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride byte distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[8][8];
    int row, col;

    /* Pass 1: vertical. */
    for (col = 0; col < 8; col++) {
        vert[0][col] = 4 * src[col];
        vert[7][col] = 4 * src[col + 7 * stride];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *mid = src + row * stride;

        for (col = 0; col < 8; col++)
            vert[row][col] = mid[col - stride] + 2 * mid[col] + mid[col + stride];
    }

    /* Pass 2: horizontal, reading only the scratch array. */
    for (row = 0; row < 8; row++) {
        uint8_t   *out = src + row * stride;
        const int *v   = vert[row];

        out[0] = (v[0] + 2) >> 2;
        out[7] = (v[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (v[col - 1] + 2 * v[col] + v[col + 1] + 8) >> 4;
    }
}
1434
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1462
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * pix2 pixel with its right-hand neighbour.
 *
 * Reads pix2[0] .. pix2[16] on every row, i.e. one pixel past the block.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose horizontally adjacent pixels are combined
 * @param line_size byte distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1490
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * pix2 pixel with the pixel directly below it.
 *
 * Reads h + 1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose vertically adjacent pixels are combined
 * @param line_size byte distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1520
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of each
 * 2x2 neighbourhood of pix2 (pixel, right, below, below-right).
 *
 * Reads 17 pixels per row over h + 1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      reference block
 * @param pix2      block whose 2x2 neighbourhoods are combined
 * @param line_size byte distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return          accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1550
/**
 * Sum of absolute differences between two 8-pixel-wide, h-line-tall areas.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines to accumulate
 * @return accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1570
/**
 * Sum of absolute differences over an 8-pixel-wide, h-line-tall area.
 * Each pixel of pix1 is compared against avg2() of the pix2 pixel at the
 * same position and its right-hand neighbour.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; column 8 is read as well
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1590
/**
 * Sum of absolute differences over an 8-pixel-wide, h-line-tall area.
 * Each pixel of pix1 is compared against avg2() of the pix2 pixel at the
 * same position and the pix2 pixel one line (line_size bytes) below it.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; line h is read as well
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1612
/**
 * Sum of absolute differences over an 8-pixel-wide, h-line-tall area.
 * Each pixel of pix1 is compared against avg4() of the 2x2 group of pix2
 * pixels whose top-left corner is at the same position.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; column 8 and line h are read as well
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg4(upper[col], upper[col + 1], lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1634
1635 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1636     MpegEncContext *c = v;
1637     int score1=0;
1638     int score2=0;
1639     int x,y;
1640
1641     for(y=0; y<h; y++){
1642         for(x=0; x<16; x++){
1643             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1644         }
1645         if(y+1<h){
1646             for(x=0; x<15; x++){
1647                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1648                              - s1[x+1] + s1[x+1+stride])
1649                         -FFABS(  s2[x  ] - s2[x  +stride]
1650                              - s2[x+1] + s2[x+1+stride]);
1651             }
1652         }
1653         s1+= stride;
1654         s2+= stride;
1655     }
1656
1657     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1658     else  return score1 + FFABS(score2)*8;
1659 }
1660
1661 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1662     MpegEncContext *c = v;
1663     int score1=0;
1664     int score2=0;
1665     int x,y;
1666
1667     for(y=0; y<h; y++){
1668         for(x=0; x<8; x++){
1669             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1670         }
1671         if(y+1<h){
1672             for(x=0; x<7; x++){
1673                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1674                              - s1[x+1] + s1[x+1+stride])
1675                         -FFABS(  s2[x  ] - s2[x  +stride]
1676                              - s2[x+1] + s2[x+1+stride]);
1677             }
1678         }
1679         s1+= stride;
1680         s2+= stride;
1681     }
1682
1683     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1684     else  return score1 + FFABS(score2)*8;
1685 }
1686
1687 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1688     int i;
1689     unsigned int sum=0;
1690
1691     for(i=0; i<8*8; i++){
1692         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1693         int w= weight[i];
1694         b>>= RECON_SHIFT;
1695         assert(-512<b && b<512);
1696
1697         sum += (w*b)*(w*b)>>4;
1698     }
1699     return sum>>2;
1700 }
1701
1702 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1703     int i;
1704
1705     for(i=0; i<8*8; i++){
1706         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1707     }
1708 }
1709
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1713
1714 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1715     int i;
1716
1717     memset(cmp, 0, sizeof(void*)*6);
1718
1719     for(i=0; i<6; i++){
1720         switch(type&0xFF){
1721         case FF_CMP_SAD:
1722             cmp[i]= c->sad[i];
1723             break;
1724         case FF_CMP_SATD:
1725             cmp[i]= c->hadamard8_diff[i];
1726             break;
1727         case FF_CMP_SSE:
1728             cmp[i]= c->sse[i];
1729             break;
1730         case FF_CMP_DCT:
1731             cmp[i]= c->dct_sad[i];
1732             break;
1733         case FF_CMP_DCT264:
1734             cmp[i]= c->dct264_sad[i];
1735             break;
1736         case FF_CMP_DCTMAX:
1737             cmp[i]= c->dct_max[i];
1738             break;
1739         case FF_CMP_PSNR:
1740             cmp[i]= c->quant_psnr[i];
1741             break;
1742         case FF_CMP_BIT:
1743             cmp[i]= c->bit[i];
1744             break;
1745         case FF_CMP_RD:
1746             cmp[i]= c->rd[i];
1747             break;
1748         case FF_CMP_VSAD:
1749             cmp[i]= c->vsad[i];
1750             break;
1751         case FF_CMP_VSSE:
1752             cmp[i]= c->vsse[i];
1753             break;
1754         case FF_CMP_ZERO:
1755             cmp[i]= zero_cmp;
1756             break;
1757         case FF_CMP_NSSE:
1758             cmp[i]= c->nsse[i];
1759             break;
1760         default:
1761             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1762         }
1763     }
1764 }
1765
1766 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1767     long i;
1768     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1769         long a = *(long*)(src+i);
1770         long b = *(long*)(dst+i);
1771         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1772     }
1773     for(; i<w; i++)
1774         dst[i+0] += src[i+0];
1775 }
1776
1777 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1778     long i;
1779 #if !HAVE_FAST_UNALIGNED
1780     if((long)src2 & (sizeof(long)-1)){
1781         for(i=0; i+7<w; i+=8){
1782             dst[i+0] = src1[i+0]-src2[i+0];
1783             dst[i+1] = src1[i+1]-src2[i+1];
1784             dst[i+2] = src1[i+2]-src2[i+2];
1785             dst[i+3] = src1[i+3]-src2[i+3];
1786             dst[i+4] = src1[i+4]-src2[i+4];
1787             dst[i+5] = src1[i+5]-src2[i+5];
1788             dst[i+6] = src1[i+6]-src2[i+6];
1789             dst[i+7] = src1[i+7]-src2[i+7];
1790         }
1791     }else
1792 #endif
1793     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1794         long a = *(long*)(src1+i);
1795         long b = *(long*)(src2+i);
1796         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1797     }
1798     for(; i<w; i++)
1799         dst[i+0] = src1[i+0]-src2[i+0];
1800 }
1801
/**
 * Reconstruct w bytes from residuals using a median predictor.
 *
 * For each position the prediction is mid_pred() of the previous output
 * byte, src1[i], and (previous + src1[i] - previous src1 byte) masked to
 * 8 bits; diff[i] is added to it and the result is stored in dst[i].
 *
 * @param dst      output bytes
 * @param src1     reference bytes
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: initial previous output byte; out: last output byte
 * @param left_top in: initial previous src1 byte; out: last src1 byte
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient = (prev + src1[x] - prev_top) & 0xFF;

        prev     = mid_pred(prev, src1[x], gradient) + diff[x];
        prev_top = src1[x];
        dst[x]   = prev;
    }

    *left     = prev;
    *left_top = prev_top;
}
1818
/**
 * Compute residuals of w bytes against a median predictor.
 *
 * For each position the prediction is mid_pred() of the previous src2
 * byte, src1[i], and (previous + src1[i] - previous src1 byte) masked to
 * 8 bits; dst[i] receives src2[i] minus that prediction.
 *
 * @param dst      output residuals
 * @param src1     reference bytes
 * @param src2     bytes being predicted
 * @param w        number of bytes
 * @param left     in: initial previous src2 byte; out: last src2 byte
 * @param left_top in: initial previous src1 byte; out: last src1 byte
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient   = (prev + src1[x] - prev_top) & 0xFF;
        const int prediction = mid_pred(prev, src1[x], gradient);

        prev_top = src1[x];
        prev     = src2[x];
        dst[x]   = prev - prediction;
    }

    *left     = prev;
    *left_top = prev_top;
}
1836
/**
 * Write the running sum of src into dst.
 *
 * @param dst output; each entry is the running sum truncated to 8 bits
 * @param src input bytes
 * @param w   number of bytes
 * @param acc starting value of the running sum
 * @return the untruncated running sum after the last byte
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1855
/* Byte offsets of the four components inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Write per-component running sums of 4-byte pixels from src into dst.
 *
 * @param dst   output pixels; each component is its running sum truncated
 *              to 8 bits
 * @param src   input pixels
 * @param w     number of pixels
 * @param red   in: starting sum for the R component; out: final sum
 * @param green same for the G component
 * @param blue  same for the B component
 * @param alpha same for the A component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int sum_r = *red;
    int sum_g = *green;
    int sum_b = *blue;
    int sum_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* read the whole pixel before writing it */
        sum_b += in[B];
        sum_g += in[G];
        sum_r += in[R];
        sum_a += in[A];

        out[B] = sum_b;
        out[G] = sum_g;
        out[R] = sum_r;
        out[A] = sum_a;
    }

    *red   = sum_r;
    *green = sum_g;
    *blue  = sum_b;
    *alpha = sum_a;
}
#undef B
#undef G
#undef R
#undef A
1896
/* Store the sum of i1 and i2 in o1 and their difference in o2. */
#define BUTTERFLY2(o1, o2, i1, i2) \
    do {                           \
        (o1) = (i1) + (i2);        \
        (o2) = (i1) - (i2);        \
    } while (0)

/* Replace x with x+y and y with x-y (both computed from the old values). */
#define BUTTERFLY1(x, y)                \
    do {                                \
        const int bfly_first  = (x);    \
        const int bfly_second = (y);    \
        (x) = bfly_first + bfly_second; \
        (y) = bfly_first - bfly_second; \
    } while (0)

/* |x+y| + |x-y|, without storing either intermediate. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1911
/**
 * Apply three butterfly stages along each row and then along each column
 * of the 8x8 difference src - dst, and return the sum of the absolute
 * values of the resulting 64 coefficients.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h == 8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        int *row               = temp + 8 * i;
        const uint8_t *src_row = src + stride * i;
        const uint8_t *dst_row = dst + stride * i;

        BUTTERFLY2(row[0], row[1], src_row[0] - dst_row[0], src_row[1] - dst_row[1]);
        BUTTERFLY2(row[2], row[3], src_row[2] - dst_row[2], src_row[3] - dst_row[3]);
        BUTTERFLY2(row[4], row[5], src_row[4] - dst_row[4], src_row[5] - dst_row[5]);
        BUTTERFLY2(row[6], row[7], src_row[6] - dst_row[6], src_row[7] - dst_row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass; the last stage is folded into the absolute sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4])
             + BUTTERFLYA(col[8 * 1], col[8 * 5])
             + BUTTERFLYA(col[8 * 2], col[8 * 6])
             + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }
    return sum;
}
1956
/**
 * Apply three butterfly stages along each row and then along each column
 * of the 8x8 block src, sum the absolute values of the resulting
 * coefficients, and subtract the magnitude of the mean term
 * (|temp[0] + temp[32]| after the column pass).
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h == 8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        int *row               = temp + 8 * i;
        const uint8_t *src_row = src + stride * i;

        BUTTERFLY2(row[0], row[1], src_row[0], src_row[1]);
        BUTTERFLY2(row[2], row[3], src_row[2], src_row[3]);
        BUTTERFLY2(row[4], row[5], src_row[4], src_row[5]);
        BUTTERFLY2(row[6], row[7], src_row[6], src_row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass; the last stage is folded into the absolute sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        sum += BUTTERFLYA(col[8 * 0], col[8 * 4])
             + BUTTERFLYA(col[8 * 1], col[8 * 5])
             + BUTTERFLYA(col[8 * 2], col[8 * 6])
             + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
2004
2005 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2006     MpegEncContext * const s= (MpegEncContext *)c;
2007     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2008
2009     assert(h==8);
2010
2011     s->dsp.diff_pixels(temp, src1, src2, stride);
2012     s->dsp.fdct(temp);
2013     return s->dsp.sum_abs_dctelem(temp);
2014 }
2015
#if CONFIG_GPL
/*
 * One 8-point integer transform pass, written as a braced block.
 * The code using it must define SRC(i) to read input i and DST(i, v) to
 * consume output i.
 */
#define DCT8_1D {                                   \
    const int s07 = SRC(0) + SRC(7);                \
    const int s16 = SRC(1) + SRC(6);                \
    const int s25 = SRC(2) + SRC(5);                \
    const int s34 = SRC(3) + SRC(4);                \
    const int a0  = s07 + s34;                      \
    const int a1  = s16 + s25;                      \
    const int a2  = s07 - s34;                      \
    const int a3  = s16 - s25;                      \
    const int d07 = SRC(0) - SRC(7);                \
    const int d16 = SRC(1) - SRC(6);                \
    const int d25 = SRC(2) - SRC(5);                \
    const int d34 = SRC(3) - SRC(4);                \
    const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
    const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
    const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
    const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
    DST(0,  a0 + a1       );                        \
    DST(1,  a4 + (a7 >> 2));                        \
    DST(2,  a2 + (a3 >> 1));                        \
    DST(3,  a5 + (a6 >> 2));                        \
    DST(4,  a0 - a1       );                        \
    DST(5,  a6 - (a5 >> 2));                        \
    DST(6, (a2 >> 1) - a3 );                        \
    DST(7, (a4 >> 2) - a7 );                        \
}

/**
 * Take the 8x8 pixel difference of src1 and src2 via dsp.diff_pixels, run
 * DCT8_1D over its rows in place and then over its columns, and return the
 * sum of the absolute values of the column-pass outputs.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive lines
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h)
{
    MpegEncContext *const enc = (MpegEncContext *)c;
    int16_t blk[8][8];
    int sum = 0;
    int k;

    enc->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* row pass: results are written back into the block */
#define SRC(x) blk[k][x]
#define DST(x,v) blk[k][x]= v
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* column pass: results are only accumulated */
#define SRC(x) blk[x][k]
#define DST(x,v) sum += FFABS(v)
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
2068
2069 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2070     MpegEncContext * const s= (MpegEncContext *)c;
2071     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2072     int sum=0, i;
2073
2074     assert(h==8);
2075
2076     s->dsp.diff_pixels(temp, src1, src2, stride);
2077     s->dsp.fdct(temp);
2078
2079     for(i=0; i<64; i++)
2080         sum= FFMAX(sum, FFABS(temp[i]));
2081
2082     return sum;
2083 }
2084
2085 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2086     MpegEncContext * const s= (MpegEncContext *)c;
2087     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2088     int16_t * const bak = temp+64;
2089     int sum=0, i;
2090
2091     assert(h==8);
2092     s->mb_intra=0;
2093
2094     s->dsp.diff_pixels(temp, src1, src2, stride);
2095
2096     memcpy(bak, temp, 64*sizeof(int16_t));
2097
2098     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2099     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2100     ff_simple_idct_8(temp); //FIXME
2101
2102     for(i=0; i<64; i++)
2103         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2104
2105     return sum;
2106 }
2107
2108 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2109     MpegEncContext * const s= (MpegEncContext *)c;
2110     const uint8_t *scantable= s->intra_scantable.permutated;
2111     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2112     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2113     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2114     int i, last, run, bits, level, distortion, start_i;
2115     const int esc_length= s->ac_esc_length;
2116     uint8_t * length;
2117     uint8_t * last_length;
2118
2119     assert(h==8);
2120
2121     copy_block8(lsrc1, src1, 8, stride, 8);
2122     copy_block8(lsrc2, src2, 8, stride, 8);
2123
2124     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2125
2126     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2127
2128     bits=0;
2129
2130     if (s->mb_intra) {
2131         start_i = 1;
2132         length     = s->intra_ac_vlc_length;
2133         last_length= s->intra_ac_vlc_last_length;
2134         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2135     } else {
2136         start_i = 0;
2137         length     = s->inter_ac_vlc_length;
2138         last_length= s->inter_ac_vlc_last_length;
2139     }
2140
2141     if(last>=start_i){
2142         run=0;
2143         for(i=start_i; i<last; i++){
2144             int j= scantable[i];
2145             level= temp[j];
2146
2147             if(level){
2148                 level+=64;
2149                 if((level&(~127)) == 0){
2150                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2151                 }else
2152                     bits+= esc_length;
2153                 run=0;
2154             }else
2155                 run++;
2156         }
2157         i= scantable[last];
2158
2159         level= temp[i] + 64;
2160
2161         assert(level - 64);
2162
2163         if((level&(~127)) == 0){
2164             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2165         }else
2166             bits+= esc_length;
2167
2168     }
2169
2170     if(last>=0){
2171         if(s->mb_intra)
2172             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2173         else
2174             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2175     }
2176
2177     s->dsp.idct_add(lsrc2, 8, temp);
2178
2179     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2180
2181     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2182 }
2183
2184 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2185     MpegEncContext * const s= (MpegEncContext *)c;
2186     const uint8_t *scantable= s->intra_scantable.permutated;
2187     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2188     int i, last, run, bits, level, start_i;
2189     const int esc_length= s->ac_esc_length;
2190     uint8_t * length;
2191     uint8_t * last_length;
2192
2193     assert(h==8);
2194
2195     s->dsp.diff_pixels(temp, src1, src2, stride);
2196
2197     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2198
2199     bits=0;
2200
2201     if (s->mb_intra) {
2202         start_i = 1;
2203         length     = s->intra_ac_vlc_length;
2204         last_length= s->intra_ac_vlc_last_length;
2205         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2206     } else {
2207         start_i = 0;
2208         length     = s->inter_ac_vlc_length;
2209         last_length= s->inter_ac_vlc_last_length;
2210     }
2211
2212     if(last>=start_i){
2213         run=0;
2214         for(i=start_i; i<last; i++){
2215             int j= scantable[i];
2216             level= temp[j];
2217
2218             if(level){
2219                 level+=64;
2220                 if((level&(~127)) == 0){
2221                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2222                 }else
2223                     bits+= esc_length;
2224                 run=0;
2225             }else
2226                 run++;
2227         }
2228         i= scantable[last];
2229
2230         level= temp[i] + 64;
2231
2232         assert(level - 64);
2233
2234         if((level&(~127)) == 0){
2235             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2236         }else
2237             bits+= esc_length;
2238     }
2239
2240     return bits;
2241 }
2242
/*
 * Define vsad_intra<size>_c(): the sum of absolute differences between
 * each pixel of a size-wide block and the pixel one line below it, over
 * lines 0 .. h-2. Columns are visited four at a time.
 */
#define VSAD_INTRA(size)                                                       \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s,        \
                                uint8_t *dummy, int stride, int h)             \
{                                                                              \
    int total = 0;                                                             \
    int col, line;                                                             \
                                                                               \
    for (line = 1; line < h; line++) {                                         \
        for (col = 0; col < size; col += 4) {                                  \
            total += FFABS(s[col    ] - s[col     + stride]);                  \
            total += FFABS(s[col + 1] - s[col + 1 + stride]);                  \
            total += FFABS(s[col + 2] - s[col + 2 + stride]);                  \
            total += FFABS(s[col + 3] - s[col + 3 + stride]);                  \
        }                                                                      \
        s += stride;                                                           \
    }                                                                          \
                                                                               \
    return total;                                                              \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2260
/**
 * Sum, over a 16-pixel-wide block and lines 0 .. h-2, of the absolute
 * difference between (s1 - s2) at a pixel and (s1 - s2) one line below it.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive lines
 * @param h      number of lines
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int col, line;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2275
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Define vsse_intra<size>_c(): the sum of squared differences between
 * each pixel of a size-wide block and the pixel one line below it, over
 * lines 0 .. h-2. Columns are visited four at a time.
 */
#define VSSE_INTRA(size)                                                       \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s,        \
                                uint8_t *dummy, int stride, int h)             \
{                                                                              \
    int total = 0;                                                             \
    int col, line;                                                             \
                                                                               \
    for (line = 1; line < h; line++) {                                         \
        for (col = 0; col < size; col += 4) {                                  \
            total += SQ(s[col    ] - s[col     + stride]);                     \
            total += SQ(s[col + 1] - s[col + 1 + stride]);                     \
            total += SQ(s[col + 2] - s[col + 2 + stride]);                     \
            total += SQ(s[col + 3] - s[col + 3 + stride]);                     \
        }                                                                      \
        s += stride;                                                           \
    }                                                                          \
                                                                               \
    return total;                                                              \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2294
/**
 * Sum, over a 16-pixel-wide block and lines 0 .. h-2, of the squared
 * difference between (s1 - s2) at a pixel and (s1 - s2) one line below it.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive lines
 * @param h      number of lines
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int col, line;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2309
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements in each array
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
    }
    return total;
}
2318
/*
 * Define name16() as the sum of name8() over the 8x8 quarters of a
 * 16-pixel-wide block: the two top quarters always, the two bottom ones
 * only when h == 16.
 */
#define WRAPPER8_16_SQ(name8, name16)                                          \
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src,      \
                  int stride, int h)                                           \
{                                                                              \
    int total = 0;                                                             \
                                                                               \
    total += name8(s, dst,     src,     stride, 8);                            \
    total += name8(s, dst + 8, src + 8, stride, 8);                            \
    if (h == 16) {                                                             \
        dst   += 8 * stride;                                                   \
        src   += 8 * stride;                                                   \
        total += name8(s, dst,     src,     stride, 8);                        \
        total += name8(s, dst + 8, src + 8, stride, 8);                        \
    }                                                                          \
    return total;                                                              \
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2343
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini
 * @param maxi     returned when a, with bit 31 flipped, is > maxisign
 * @param maxisign threshold for the flipped comparison
 * @return mini, maxi, or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2352
/**
 * Clip src into dst by running clipf_c_one() on the raw 32-bit patterns of
 * the floats. Elements are processed in groups of 8, so the number
 * processed is len rounded up to a multiple of 8.
 *
 * @param dst output
 * @param src input
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len element count
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len)
{
    const uint32_t lo_bits  = *(uint32_t*)min;
    const uint32_t hi_bits  = *(uint32_t*)max;
    const uint32_t hi_flip  = hi_bits ^ (1U<<31);
    uint32_t *out           = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
    }
}
/**
 * Clip each element of src to [min, max] and store it in dst.
 *
 * When min is negative and max is positive the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are processed in groups of 8, so the number processed
 * is len rounded up to a multiple of 8.
 *
 * @param dst output
 * @param src input
 * @param min lower bound
 * @param max upper bound
 * @param len element count
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2388
/**
 * Dot product of two int16_t vectors.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;

    for (; order != 0; order--) {
        acc += *v1 * *v2;
        v1++;
        v2++;
    }

    return acc;
}
2398
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[i] += mul * v3[i]. Each product uses the value of v1[i] from before
 * its update.
 *
 * @param v1    first vector, modified
 * @param v2    second vector
 * @param v3    vector that is scaled and added into v1
 * @param order number of elements
 * @param mul   scale factor for v3
 * @return sum of (old) v1[i] * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order != 0; order--) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
        v1++;
        v2++;
        v3++;
    }
    return acc;
}
2408
/**
 * Multiply input by a symmetric window. window[i] is applied both to
 * element i and to element len-1-i. Each product is rounded by adding
 * 1 << 14 and shifted right by 15.
 *
 * @param output destination, len elements
 * @param input  source, len elements
 * @param window first len/2 window coefficients
 * @param len    number of samples; for odd len the middle one is not written
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int k;

    for (k = 0; k < half; k++) {
        const unsigned int mirror = len - k - 1;
        const int16_t coeff       = window[k];

        output[k]      = (MUL16(input[k],      coeff) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], coeff) + (1 << 14)) >> 15;
    }
}
2421
/**
 * Clip each element of src to [min, max] with av_clip() and store it in
 * dst, 8 elements per iteration.
 *
 * The loop body runs before len is tested and len is unsigned, so at
 * least 8 elements are always processed and len has to be a multiple of 8
 * for the loop to stop at the intended element.
 *
 * @param dst output
 * @param src input
 * @param min lower bound
 * @param max upper bound
 * @param len element count
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
2437
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2448
2449 /* init static data */
2450 av_cold void ff_dsputil_static_init(void)
2451 {
2452     int i;
2453
2454     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2455     for(i=0;i<MAX_NEG_CROP;i++) {
2456         ff_cropTbl[i] = 0;
2457         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2458     }
2459
2460     for(i=0;i<512;i++) {
2461         ff_squareTbl[i] = (i - 256) * (i - 256);
2462     }
2463
2464     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2465 }
2466
2467 int ff_check_alignment(void){
2468     static int did_fail=0;
2469     LOCAL_ALIGNED_16(int, aligned, [4]);
2470
2471     if((intptr_t)aligned & 15){
2472         if(!did_fail){
2473 #if HAVE_MMX || HAVE_ALTIVEC
2474             av_log(NULL, AV_LOG_ERROR,
2475                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2476                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2477                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2478                 "Do not report crashes to Libav developers.\n");
2479 #endif
2480             did_fail=1;
2481         }
2482         return -1;
2483     }
2484     return 0;
2485 }
2486
2487 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2488 {
2489     ff_check_alignment();
2490
2491 #if CONFIG_ENCODERS
2492     if (avctx->bits_per_raw_sample == 10) {
2493         c->fdct    = ff_jpeg_fdct_islow_10;
2494         c->fdct248 = ff_fdct248_islow_10;
2495     } else {
2496         if(avctx->dct_algo==FF_DCT_FASTINT) {
2497             c->fdct    = ff_fdct_ifast;
2498             c->fdct248 = ff_fdct_ifast248;
2499         }
2500         else if(avctx->dct_algo==FF_DCT_FAAN) {
2501             c->fdct    = ff_faandct;
2502             c->fdct248 = ff_faandct248;
2503         }
2504         else {
2505             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2506             c->fdct248 = ff_fdct248_islow_8;
2507         }
2508     }
2509 #endif //CONFIG_ENCODERS
2510
2511     if (avctx->bits_per_raw_sample == 10) {
2512         c->idct_put              = ff_simple_idct_put_10;
2513         c->idct_add              = ff_simple_idct_add_10;
2514         c->idct                  = ff_simple_idct_10;
2515         c->idct_permutation_type = FF_NO_IDCT_PERM;
2516     } else {
2517         if(avctx->idct_algo==FF_IDCT_INT){
2518             c->idct_put= ff_jref_idct_put;
2519             c->idct_add= ff_jref_idct_add;
2520             c->idct    = ff_j_rev_dct;
2521             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2522         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2523             c->idct_put= ff_faanidct_put;
2524             c->idct_add= ff_faanidct_add;
2525             c->idct    = ff_faanidct;
2526             c->idct_permutation_type= FF_NO_IDCT_PERM;
2527         }else{ //accurate/default
2528             c->idct_put = ff_simple_idct_put_8;
2529             c->idct_add = ff_simple_idct_add_8;
2530             c->idct     = ff_simple_idct_8;
2531             c->idct_permutation_type= FF_NO_IDCT_PERM;
2532         }
2533     }
2534
2535     c->diff_pixels = diff_pixels_c;
2536     c->put_pixels_clamped = put_pixels_clamped_c;
2537     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2538     c->add_pixels_clamped = add_pixels_clamped_c;
2539     c->sum_abs_dctelem = sum_abs_dctelem_c;
2540     c->gmc1 = gmc1_c;
2541     c->gmc = ff_gmc_c;
2542     c->pix_sum = pix_sum_c;
2543     c->pix_norm1 = pix_norm1_c;
2544
2545     c->fill_block_tab[0] = fill_block16_c;
2546     c->fill_block_tab[1] = fill_block8_c;
2547
2548     /* TODO [0] 16  [1] 8 */
2549     c->pix_abs[0][0] = pix_abs16_c;
2550     c->pix_abs[0][1] = pix_abs16_x2_c;
2551     c->pix_abs[0][2] = pix_abs16_y2_c;
2552     c->pix_abs[0][3] = pix_abs16_xy2_c;
2553     c->pix_abs[1][0] = pix_abs8_c;
2554     c->pix_abs[1][1] = pix_abs8_x2_c;
2555     c->pix_abs[1][2] = pix_abs8_y2_c;
2556     c->pix_abs[1][3] = pix_abs8_xy2_c;
2557
2558     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2559     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2560     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2561     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2562     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2563     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2564     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2565     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2566     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2567
2568     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2569     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2570     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2571     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2572     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2573     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2574     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2575     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2576     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2577
2578 #define dspfunc(PFX, IDX, NUM) \
2579     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2580     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2581     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2582     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2583     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2584     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2585     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2586     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2587     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2588     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2589     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2590     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2591     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2592     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2593     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2594     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2595
2596     dspfunc(put_qpel, 0, 16);
2597     dspfunc(put_no_rnd_qpel, 0, 16);
2598
2599     dspfunc(avg_qpel, 0, 16);
2600     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2601
2602     dspfunc(put_qpel, 1, 8);
2603     dspfunc(put_no_rnd_qpel, 1, 8);
2604
2605     dspfunc(avg_qpel, 1, 8);
2606     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2607
2608 #undef dspfunc
2609
2610     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2611     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2612     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2613     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2614     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2615     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2616     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2617     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2618
2619 #define SET_CMP_FUNC(name) \
2620     c->name[0]= name ## 16_c;\
2621     c->name[1]= name ## 8x8_c;
2622
2623     SET_CMP_FUNC(hadamard8_diff)
2624     c->hadamard8_diff[4]= hadamard8_intra16_c;
2625     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2626     SET_CMP_FUNC(dct_sad)
2627     SET_CMP_FUNC(dct_max)
2628 #if CONFIG_GPL
2629     SET_CMP_FUNC(dct264_sad)
2630 #endif
2631     c->sad[0]= pix_abs16_c;
2632     c->sad[1]= pix_abs8_c;
2633     c->sse[0]= sse16_c;
2634     c->sse[1]= sse8_c;
2635     c->sse[2]= sse4_c;
2636     SET_CMP_FUNC(quant_psnr)
2637     SET_CMP_FUNC(rd)
2638     SET_CMP_FUNC(bit)
2639     c->vsad[0]= vsad16_c;
2640     c->vsad[4]= vsad_intra16_c;
2641     c->vsad[5]= vsad_intra8_c;
2642     c->vsse[0]= vsse16_c;
2643     c->vsse[4]= vsse_intra16_c;
2644     c->vsse[5]= vsse_intra8_c;
2645     c->nsse[0]= nsse16_c;
2646     c->nsse[1]= nsse8_c;
2647
2648     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2649
2650     c->add_bytes= add_bytes_c;
2651     c->diff_bytes= diff_bytes_c;
2652     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2653     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2654     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2655     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2656     c->bswap_buf= bswap_buf;
2657     c->bswap16_buf = bswap16_buf;
2658
2659     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2660         c->h263_h_loop_filter= h263_h_loop_filter_c;
2661         c->h263_v_loop_filter= h263_v_loop_filter_c;
2662     }
2663
2664     c->h261_loop_filter= h261_loop_filter_c;
2665
2666     c->try_8x8basis= try_8x8basis_c;
2667     c->add_8x8basis= add_8x8basis_c;
2668
2669     c->vector_clipf = vector_clipf_c;
2670     c->scalarproduct_int16 = scalarproduct_int16_c;
2671     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2672     c->apply_window_int16 = apply_window_int16_c;
2673     c->vector_clip_int32 = vector_clip_int32_c;
2674
2675     c->shrink[0]= av_image_copy_plane;
2676     c->shrink[1]= ff_shrink22;
2677     c->shrink[2]= ff_shrink44;
2678     c->shrink[3]= ff_shrink88;
2679
2680 #define hpel_funcs(prefix, idx, num) \
2681     c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2682     c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2683     c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2684     c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2685
2686     hpel_funcs(put, [0], 16);
2687     hpel_funcs(put, [1],  8);
2688     hpel_funcs(put, [2],  4);
2689     hpel_funcs(put, [3],  2);
2690     hpel_funcs(put_no_rnd, [0], 16);
2691     hpel_funcs(put_no_rnd, [1],  8);
2692     hpel_funcs(avg, [0], 16);
2693     hpel_funcs(avg, [1],  8);
2694     hpel_funcs(avg, [2],  4);
2695     hpel_funcs(avg, [3],  2);
2696     hpel_funcs(avg_no_rnd,, 16);
2697
2698 #undef FUNC
2699 #undef FUNCC
2700 #define FUNC(f, depth) f ## _ ## depth
2701 #define FUNCC(f, depth) f ## _ ## depth ## _c
2702
2703 #define BIT_DEPTH_FUNCS(depth, dct)\
2704     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
2705     c->draw_edges                    = FUNCC(draw_edges            , depth);\
2706     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
2707     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
2708     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
2709     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
2710
2711     switch (avctx->bits_per_raw_sample) {
2712     case 9:
2713         if (c->dct_bits == 32) {
2714             BIT_DEPTH_FUNCS(9, _32);
2715         } else {
2716             BIT_DEPTH_FUNCS(9, _16);
2717         }
2718         break;
2719     case 10:
2720         if (c->dct_bits == 32) {
2721             BIT_DEPTH_FUNCS(10, _32);
2722         } else {
2723             BIT_DEPTH_FUNCS(10, _16);
2724         }
2725         break;
2726     default:
2727         BIT_DEPTH_FUNCS(8, _16);
2728         break;
2729     }
2730
2731
2732     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2733     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2734     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2735     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2736     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2737     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2738     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2739
2740     ff_init_scantable_permutation(c->idct_permutation,
2741                                   c->idct_permutation_type);
2742 }