3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping table with MAX_NEG_CROP headroom on both sides; zero-filled here
 * and populated at runtime by the dsputil init code — NOTE(review): the
 * initializer is not in this chunk, confirm it runs before first use. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Table of squares, also filled at init time; typically indexed with a +256
 * bias (see sse*_c below) so signed byte differences can be squared by lookup. */
uint32_t ff_squareTbl[512] = {0, };
44 #include "dsputil_template.c"
48 #include "dsputil_template.c"
52 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 yields 0x0101...01 (one per byte of unsigned long); multiplying by
 * the byte value replicates it into every byte lane. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Standard JPEG/MPEG zigzag scan order: maps scan position -> raster index.
 * (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
/* (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned for SIMD access; NOTE(review): the code that fills this
 * table is not visible in this chunk — confirm it is initialized before use. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (MPEG-2 style).
 * (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 style).
 * (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
/* (Reconstructed missing `};` terminator lost to truncation.) */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
121 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
125 st->scantable= src_scantable;
129 j = src_scantable[i];
130 st->permutated[i] = permutation[j];
136 j = st->permutated[i];
138 st->raster_end[i]= end;
142 void ff_init_scantable_permutation(uint8_t *idct_permutation,
143 int idct_permutation_type)
147 switch(idct_permutation_type){
148 case FF_NO_IDCT_PERM:
150 idct_permutation[i]= i;
152 case FF_LIBMPEG2_IDCT_PERM:
154 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
156 case FF_SIMPLE_IDCT_PERM:
158 idct_permutation[i]= simple_mmx_permutation[i];
160 case FF_TRANSPOSE_IDCT_PERM:
162 idct_permutation[i]= ((i&7)<<3) | (i>>3);
164 case FF_PARTTRANS_IDCT_PERM:
166 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
168 case FF_SSE2_IDCT_PERM:
170 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
173 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum of all 256 pixels of a 16x16 block.
 * (Reconstructed: the unrolled adds and accumulator scaffolding were lost
 * to truncation; inner loop processes 8 pixels per iteration as the visible
 * `j += 8` header shows.)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* step to the next row */
    }
    return s;
}
199 static int pix_norm1_c(uint8_t * pix, int line_size)
202 uint32_t *sq = ff_squareTbl + 256;
205 for (i = 0; i < 16; i++) {
206 for (j = 0; j < 16; j += 8) {
218 register uint64_t x=*(uint64_t*)pix;
220 s += sq[(x>>8)&0xff];
221 s += sq[(x>>16)&0xff];
222 s += sq[(x>>24)&0xff];
223 s += sq[(x>>32)&0xff];
224 s += sq[(x>>40)&0xff];
225 s += sq[(x>>48)&0xff];
226 s += sq[(x>>56)&0xff];
228 register uint32_t x=*(uint32_t*)pix;
230 s += sq[(x>>8)&0xff];
231 s += sq[(x>>16)&0xff];
232 s += sq[(x>>24)&0xff];
233 x=*(uint32_t*)(pix+4);
235 s += sq[(x>>8)&0xff];
236 s += sq[(x>>16)&0xff];
237 s += sq[(x>>24)&0xff];
242 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (may alias).
 * Main loop is unrolled by 8; the tail loop handles the remainder.
 * (Reconstructed: tail loop header and closing braces lost to truncation.)
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w;i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
/**
 * Byte-swap len 16-bit words from src into dst.
 * (Reconstructed: while-loop wrapper lost to truncation.)
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
271 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
274 uint32_t *sq = ff_squareTbl + 256;
277 for (i = 0; i < h; i++) {
278 s += sq[pix1[0] - pix2[0]];
279 s += sq[pix1[1] - pix2[1]];
280 s += sq[pix1[2] - pix2[2]];
281 s += sq[pix1[3] - pix2[3]];
288 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
291 uint32_t *sq = ff_squareTbl + 256;
294 for (i = 0; i < h; i++) {
295 s += sq[pix1[0] - pix2[0]];
296 s += sq[pix1[1] - pix2[1]];
297 s += sq[pix1[2] - pix2[2]];
298 s += sq[pix1[3] - pix2[3]];
299 s += sq[pix1[4] - pix2[4]];
300 s += sq[pix1[5] - pix2[5]];
301 s += sq[pix1[6] - pix2[6]];
302 s += sq[pix1[7] - pix2[7]];
309 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
312 uint32_t *sq = ff_squareTbl + 256;
315 for (i = 0; i < h; i++) {
316 s += sq[pix1[ 0] - pix2[ 0]];
317 s += sq[pix1[ 1] - pix2[ 1]];
318 s += sq[pix1[ 2] - pix2[ 2]];
319 s += sq[pix1[ 3] - pix2[ 3]];
320 s += sq[pix1[ 4] - pix2[ 4]];
321 s += sq[pix1[ 5] - pix2[ 5]];
322 s += sq[pix1[ 6] - pix2[ 6]];
323 s += sq[pix1[ 7] - pix2[ 7]];
324 s += sq[pix1[ 8] - pix2[ 8]];
325 s += sq[pix1[ 9] - pix2[ 9]];
326 s += sq[pix1[10] - pix2[10]];
327 s += sq[pix1[11] - pix2[11]];
328 s += sq[pix1[12] - pix2[12]];
329 s += sq[pix1[13] - pix2[13]];
330 s += sq[pix1[14] - pix2[14]];
331 s += sq[pix1[15] - pix2[15]];
/**
 * Compute an 8x8 block of pixel differences: block[i] = s1[i] - s2[i],
 * reading 8 rows of 8 pixels with stride `stride` and writing 64 int16 values.
 * (Reconstructed: loop scaffolding and pointer stepping lost to truncation.)
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++){
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
/**
 * Store an 8x8 block of int16 coefficients as pixels, clamping to [0,255].
 * (Reconstructed: the `int line_size` parameter line, loop scaffolding and
 * pointer stepping were lost to truncation; signature matches upstream.)
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++){
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);
        pixels += line_size;
        block += 8;
    }
}
/**
 * Store an 8x8 block of signed int16 coefficients as pixels with a +128 bias:
 * values below -128 clamp to 0, above 127 clamp to 255.
 * (Reconstructed: branch bodies and pointer stepping lost to truncation.)
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8); /* advance to the next output row */
    }
}
/**
 * Add an 8x8 block of int16 coefficients onto pixels, clamping to [0,255].
 * (Reconstructed: the `int line_size` parameter line, loop scaffolding and
 * pointer stepping were lost to truncation; signature matches upstream.)
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++){
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels += line_size;
        block += 8;
    }
}
/**
 * Sum of absolute values of all 64 coefficients of a DCT block.
 * (Reconstructed: accumulator, loop header and return lost to truncation.)
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
/**
 * Fill h rows of 16 bytes with `value`, stepping by line_size per row.
 * (Reconstructed: loop counter, row stepping and braces lost to truncation.)
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill h rows of 8 bytes with `value`, stepping by line_size per row.
 * (Reconstructed: loop counter, row stepping and braces lost to truncation.)
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages of 2 and 4 values. Arguments are parenthesized so the
 * macros stay correct when called with compound expressions (the originals
 * expanded `a+b` unparenthesized, a precedence hazard). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * 1/16-pel bilinear interpolation of an 8-wide block over h rows
 * (MPEG-4 GMC, one motion point). x16/y16 are the fractional positions in
 * 1/16ths; the four taps A..D sum to 256, so >>8 normalizes after `rounder`.
 * (Reconstructed: loop scaffolding and pointer stepping lost to truncation.)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * General global-motion compensation: for each destination pixel, step an
 * affine motion field (dxx/dxy/dyx/dyy in 1/(1<<shift) units from ox/oy) and
 * bilinearly interpolate from src, clamping coordinates at the picture edges.
 * NOTE(review): this copy is truncated — the outer y loop, the per-pixel
 * vx/vy stepping and the src_x/src_y/frac derivation are missing below.
 * Do not rewrite from this fragment; restore from upstream dsputil.c.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s= 1<<shift; /* one full pel in fixed-point units */
    for(x=0; x<8; x++){ //XXX FIXME optimize
        int src_x, src_y, frac_x, frac_y, index;
        /* fully inside the picture: plain 2x2 bilinear */
        if((unsigned)src_x < width){
            if((unsigned)src_y < height){
                index= src_x + src_y*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                      + src[index       +1]*   frac_x )*(s-frac_y)
                                   + (  src[index+stride  ]*(s-frac_x)
                                      + src[index+stride+1]*   frac_x )*   frac_y
                /* x inside, y outside: clamp y, interpolate in x only */
                index= src_x + av_clip(src_y, 0, height)*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                      + src[index       +1]*   frac_x )*s
            /* x outside, y inside: clamp x, interpolate in y only */
            if((unsigned)src_y < height){
                index= av_clip(src_x, 0, width) + src_y*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                      + src[index+stride  ]*   frac_y )*s
                /* both outside: clamp both, plain copy */
                index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                dst[y*stride + x]=    src[index         ];
/**
 * Third-pel MC, integer position: plain copy dispatched on block width.
 * (Reconstructed: switch header and closing braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/**
 * Third-pel MC, 1/3 horizontal: 683/2048 approximates 1/3 with rounding.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 horizontal.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 1/3 vertical.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,1/3): bilinear with weights 4/3/3/2 (sum 12),
 * 2731/32768 approximating 1/12.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,2/3): bilinear with weights 3/2/4/3 (sum 12).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 vertical.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,1/3): bilinear with weights 3/4/2/3 (sum 12).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,2/3): bilinear with weights 2/3/3/4 (sum 12).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC averaging, integer position: dispatched on block width.
 * (Reconstructed: switch header and closing braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/**
 * Third-pel MC, 1/3 horizontal, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 horizontal, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 1/3 vertical, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,1/3) bilinear (weights 4/3/3/2), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,2/3) bilinear (weights 3/2/4/3), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 vertical, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,1/3) bilinear (weights 3/4/2/3), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,2/3) bilinear (weights 2/3/3/4), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
728 #define QPEL_MC(r, OPNAME, RND, OP) \
729 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
730 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
734 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
735 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
736 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
737 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
738 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
739 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
740 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
741 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
747 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
749 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
753 const int src0= src[0*srcStride];\
754 const int src1= src[1*srcStride];\
755 const int src2= src[2*srcStride];\
756 const int src3= src[3*srcStride];\
757 const int src4= src[4*srcStride];\
758 const int src5= src[5*srcStride];\
759 const int src6= src[6*srcStride];\
760 const int src7= src[7*srcStride];\
761 const int src8= src[8*srcStride];\
762 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
763 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
764 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
765 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
766 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
767 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
768 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
769 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
775 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
776 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
781 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
782 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
783 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
784 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
785 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
786 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
787 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
788 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
789 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
790 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
791 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
792 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
793 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
794 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
795 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
796 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
802 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
803 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
808 const int src0= src[0*srcStride];\
809 const int src1= src[1*srcStride];\
810 const int src2= src[2*srcStride];\
811 const int src3= src[3*srcStride];\
812 const int src4= src[4*srcStride];\
813 const int src5= src[5*srcStride];\
814 const int src6= src[6*srcStride];\
815 const int src7= src[7*srcStride];\
816 const int src8= src[8*srcStride];\
817 const int src9= src[9*srcStride];\
818 const int src10= src[10*srcStride];\
819 const int src11= src[11*srcStride];\
820 const int src12= src[12*srcStride];\
821 const int src13= src[13*srcStride];\
822 const int src14= src[14*srcStride];\
823 const int src15= src[15*srcStride];\
824 const int src16= src[16*srcStride];\
825 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
826 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
827 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
828 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
829 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
830 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
831 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
832 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
833 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
834 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
835 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
836 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
837 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
838 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
839 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
840 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
846 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
848 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
849 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
852 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
853 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
856 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
858 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
859 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
862 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
865 copy_block9(full, src, 16, stride, 9);\
866 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
867 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
870 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
872 copy_block9(full, src, 16, stride, 9);\
873 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
876 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
879 copy_block9(full, src, 16, stride, 9);\
880 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
881 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
883 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
888 copy_block9(full, src, 16, stride, 9);\
889 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
890 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
891 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
892 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
894 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
898 copy_block9(full, src, 16, stride, 9);\
899 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
900 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
901 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
902 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
904 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
909 copy_block9(full, src, 16, stride, 9);\
910 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
911 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
912 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
913 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
915 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
919 copy_block9(full, src, 16, stride, 9);\
920 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
921 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
922 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
923 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
925 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
930 copy_block9(full, src, 16, stride, 9);\
931 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
932 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
933 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
934 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
936 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
940 copy_block9(full, src, 16, stride, 9);\
941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
942 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
944 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
946 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
951 copy_block9(full, src, 16, stride, 9);\
952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
953 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
954 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
955 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
957 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
961 copy_block9(full, src, 16, stride, 9);\
962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
963 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
965 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
967 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
972 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
974 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
978 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
979 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
981 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
986 copy_block9(full, src, 16, stride, 9);\
987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
988 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
990 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
992 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
998 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1000 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1001 uint8_t full[16*9];\
1004 uint8_t halfHV[64];\
1005 copy_block9(full, src, 16, stride, 9);\
1006 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1008 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1011 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1012 uint8_t full[16*9];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1017 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1019 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1021 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1022 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1025 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1027 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1028 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1031 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1032 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1035 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1037 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1038 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1041 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[24*17];\
1044 copy_block17(full, src, 24, stride, 17);\
1045 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1046 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1049 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1050 uint8_t full[24*17];\
1051 copy_block17(full, src, 24, stride, 17);\
1052 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1055 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1056 uint8_t full[24*17];\
1058 copy_block17(full, src, 24, stride, 17);\
1059 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1060 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1062 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t full[24*17];\
1064 uint8_t halfH[272];\
1065 uint8_t halfV[256];\
1066 uint8_t halfHV[256];\
1067 copy_block17(full, src, 24, stride, 17);\
1068 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1069 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1070 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1071 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1073 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1074 uint8_t full[24*17];\
1075 uint8_t halfH[272];\
1076 uint8_t halfHV[256];\
1077 copy_block17(full, src, 24, stride, 17);\
1078 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1079 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1080 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1081 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1083 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1084 uint8_t full[24*17];\
1085 uint8_t halfH[272];\
1086 uint8_t halfV[256];\
1087 uint8_t halfHV[256];\
1088 copy_block17(full, src, 24, stride, 17);\
1089 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1090 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1091 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1092 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1094 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1095 uint8_t full[24*17];\
1096 uint8_t halfH[272];\
1097 uint8_t halfHV[256];\
1098 copy_block17(full, src, 24, stride, 17);\
1099 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1100 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1101 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1102 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1104 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1105 uint8_t full[24*17];\
1106 uint8_t halfH[272];\
1107 uint8_t halfV[256];\
1108 uint8_t halfHV[256];\
1109 copy_block17(full, src, 24, stride, 17);\
1110 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1111 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1112 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1113 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1115 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1116 uint8_t full[24*17];\
1117 uint8_t halfH[272];\
1118 uint8_t halfHV[256];\
1119 copy_block17(full, src, 24, stride, 17);\
1120 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1121 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1122 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1123 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1125 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1126 uint8_t full[24*17];\
1127 uint8_t halfH[272];\
1128 uint8_t halfV[256];\
1129 uint8_t halfHV[256];\
1130 copy_block17(full, src, 24, stride, 17);\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1132 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1133 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1134 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1136 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1138 uint8_t halfH[272];\
1139 uint8_t halfHV[256];\
1140 copy_block17(full, src, 24, stride, 17);\
1141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1146 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1147 uint8_t halfH[272];\
1148 uint8_t halfHV[256];\
1149 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1150 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1151 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1153 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1154 uint8_t halfH[272];\
1155 uint8_t halfHV[256];\
1156 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1157 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1158 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1160 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1161 uint8_t full[24*17];\
1162 uint8_t halfH[272];\
1163 uint8_t halfV[256];\
1164 uint8_t halfHV[256];\
1165 copy_block17(full, src, 24, stride, 17);\
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1171 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1172 uint8_t full[24*17];\
1173 uint8_t halfH[272];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1177 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1179 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1180 uint8_t full[24*17];\
1181 uint8_t halfH[272];\
1182 uint8_t halfV[256];\
1183 uint8_t halfHV[256];\
1184 copy_block17(full, src, 24, stride, 17);\
1185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1190 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1191 uint8_t full[24*17];\
1192 uint8_t halfH[272];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1196 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1198 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t halfH[272];\
1200 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1201 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel store operators used by QPEL_MC above: 'b' is the raw filter sum,
 * cm[] (ff_cropTbl) clamps the rounded, downshifted value to 0..255.
 * The "+16>>5" form rounds to nearest; the no_rnd variants use +15. */
1204 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1205 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1206 #define op_put(a, b) a = cm[((b) + 16)>>5]
1207 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel MC function families: put, put_no_rnd and avg. */
1209 QPEL_MC(0, put_ , _ , op_put)
1210 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1211 QPEL_MC(0, avg_ , _ , op_avg)
1212 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1214 #undef op_avg_no_rnd
1216 #undef op_put_no_rnd
/* The mc00 (integer-pel) cases need no filtering — alias them to the
 * plain block copy/average helpers. */
1218 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1219 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1220 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1221 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1222 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1223 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 horizontal half-pel lowpass: 4-tap filter (-1, 9, 9, -1)/16 with
 * rounding (+8 >> 4), clamped via the crop table.
 * NOTE(review): the per-row loop header/footer lines appear elided in this
 * excerpt; the visible statements compute one 8-pixel row. */
1225 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1226 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1230 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1231 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1232 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1233 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1234 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1235 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1236 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1237 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1243 #if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel position: the spec maps it to the plain
 * half-pel xy2 average, so these simply forward to pixels*_xy2 helpers. */
1244 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1245 put_pixels16_xy2_8_c(dst, src, stride, 16);
1247 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1248 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1250 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1251 put_pixels8_xy2_8_c(dst, src, stride, 8);
1253 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1254 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1256 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel lowpass: same (-1, 9, 9, -1)/16 filter as the
 * horizontal variant, applied down a column. One source column is loaded
 * into locals (src_1 .. src9) and eight filtered outputs are written.
 * NOTE(review): the per-column loop lines appear elided in this excerpt. */
1258 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1259 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1263 const int src_1= src[ -srcStride];
1264 const int src0 = src[0 ];
1265 const int src1 = src[ srcStride];
1266 const int src2 = src[2*srcStride];
1267 const int src3 = src[3*srcStride];
1268 const int src4 = src[4*srcStride];
1269 const int src5 = src[5*srcStride];
1270 const int src6 = src[6*srcStride];
1271 const int src7 = src[7*srcStride];
1272 const int src8 = src[8*srcStride];
1273 const int src9 = src[9*srcStride];
1274 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1275 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1276 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1277 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1278 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1279 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1280 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1281 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation positions. Naming: mcXY means X = horizontal
 * and Y = vertical sub-pel phase (0 = full pel, 2 = half pel; 1/3 combine a
 * half-pel filter result with the nearest full-pel via pixels8_l2 averaging).
 * NOTE(review): local buffer declarations (half/halfH/halfV/halfHV) and
 * closing braces appear elided in this excerpt. */
1287 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1289 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1290 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1293 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1294 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1297 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1299 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1300 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1303 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1304 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1307 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* h filter over 11 rows (one above, two below) so the v filter has context */
1311 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1312 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1313 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1314 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1316 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1320 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1321 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1322 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1323 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1325 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1327 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1328 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 in-loop deblocking across a horizontal block edge (vertical filter).
 * For each column x: p0,p1 | p2,p3 straddle the edge; d is the gradient
 * estimate, d1 is the strength-clipped correction applied to p1/p2, and d2
 * is a smaller correction (clipped to |d1|) applied to the outer p0/p3.
 * Compiled out unless an H.263 codec is enabled.
 * NOTE(review): the column loop, d1->p1/p2 application and several
 * declarations appear elided in this excerpt. */
1331 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1332 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1334 const int strength= ff_h263_loop_filter_strength[qscale];
1338 int p0= src[x-2*stride];
1339 int p1= src[x-1*stride];
1340 int p2= src[x+0*stride];
1341 int p3= src[x+1*stride];
1342 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear clipping of d against the qscale-derived strength */
1344 if (d<-2*strength) d1= 0;
1345 else if(d<- strength) d1=-2*strength - d;
1346 else if(d< strength) d1= d;
1347 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255 after applying the correction */
1352 if(p1&256) p1= ~(p1>>31);
1353 if(p2&256) p2= ~(p2>>31);
1355 src[x-1*stride] = p1;
1356 src[x+0*stride] = p2;
1360 d2= av_clip((p0-p3)/4, -ad1, ad1);
1362 src[x-2*stride] = p0 - d2;
1363 src[x+ stride] = p3 + d2;
/* H.263 in-loop deblocking across a vertical block edge (horizontal filter).
 * Mirror of h263_v_loop_filter_c with the pixel fetches transposed
 * (row y, offsets -2..+1 along the row instead of along the column).
 * NOTE(review): the row loop and several intermediate lines appear elided
 * in this excerpt. */
1368 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1369 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1371 const int strength= ff_h263_loop_filter_strength[qscale];
1375 int p0= src[y*stride-2];
1376 int p1= src[y*stride-1];
1377 int p2= src[y*stride+0];
1378 int p3= src[y*stride+1];
1379 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1381 if (d<-2*strength) d1= 0;
1382 else if(d<- strength) d1=-2*strength - d;
1383 else if(d< strength) d1= d;
1384 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255 after applying the correction */
1389 if(p1&256) p1= ~(p1>>31);
1390 if(p2&256) p2= ~(p2>>31);
1392 src[y*stride-1] = p1;
1393 src[y*stride+0] = p2;
1397 d2= av_clip((p0-p3)/4, -ad1, ad1);
1399 src[y*stride-2] = p0 - d2;
1400 src[y*stride+1] = p3 + d2;
/* H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * temp[] working array scaled by 4; border rows/columns are copied with
 * rounding only ((x+2)>>2) while interior pixels get the full 2-D filter
 * ((...+8)>>4).
 * NOTE(review): the loop headers and temp[] declaration appear elided in
 * this excerpt — confirm against the full file before relying on bounds. */
1405 static void h261_loop_filter_c(uint8_t *src, int stride){
1410 temp[x ] = 4*src[x ];
1411 temp[x + 7*8] = 4*src[x + 7*stride];
1415 xy = y * stride + x;
1417 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1422 src[ y*stride] = (temp[ y*8] + 2)>>2;
1423 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1425 xy = y * stride + x;
1427 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: sum of absolute differences between pix1 and pix2,
 * one unrolled row of 16 per iteration, h rows total.
 * NOTE(review): accumulator init, the row loop and pointer advances appear
 * elided in this excerpt. */
1432 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1438 s += abs(pix1[0] - pix2[0]);
1439 s += abs(pix1[1] - pix2[1]);
1440 s += abs(pix1[2] - pix2[2]);
1441 s += abs(pix1[3] - pix2[3]);
1442 s += abs(pix1[4] - pix2[4]);
1443 s += abs(pix1[5] - pix2[5]);
1444 s += abs(pix1[6] - pix2[6]);
1445 s += abs(pix1[7] - pix2[7]);
1446 s += abs(pix1[8] - pix2[8]);
1447 s += abs(pix1[9] - pix2[9]);
1448 s += abs(pix1[10] - pix2[10]);
1449 s += abs(pix1[11] - pix2[11]);
1450 s += abs(pix1[12] - pix2[12]);
1451 s += abs(pix1[13] - pix2[13]);
1452 s += abs(pix1[14] - pix2[14]);
1453 s += abs(pix1[15] - pix2[15]);
/* SAD vs. the horizontal half-pel interpolation of pix2: each reference
 * sample is avg2() of two horizontally adjacent pixels (reads pix2[16]). */
1460 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1466 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1467 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1468 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1469 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1470 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1471 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1472 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1473 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1474 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1475 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1476 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1477 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1478 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1479 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1480 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1481 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs. the vertical half-pel interpolation of pix2: each reference
 * sample is avg2() of vertically adjacent pixels (pix3 = next row). */
1488 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1491 uint8_t *pix3 = pix2 + line_size;
1495 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1496 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1497 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1498 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1499 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1500 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1501 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1502 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1503 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1504 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1505 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1506 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1507 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1508 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1509 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1510 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs. the diagonal half-pel interpolation of pix2: each reference
 * sample is avg4() of a 2x2 neighbourhood (reads one extra column/row). */
1518 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1521 uint8_t *pix3 = pix2 + line_size;
1525 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1526 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1527 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1528 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1529 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1530 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1531 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1532 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1533 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1534 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1535 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1536 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1537 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1538 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1539 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1540 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD — same pattern as pix_abs16_c with an 8-pixel row. */
1548 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1554 s += abs(pix1[0] - pix2[0]);
1555 s += abs(pix1[1] - pix2[1]);
1556 s += abs(pix1[2] - pix2[2]);
1557 s += abs(pix1[3] - pix2[3]);
1558 s += abs(pix1[4] - pix2[4]);
1559 s += abs(pix1[5] - pix2[5]);
1560 s += abs(pix1[6] - pix2[6]);
1561 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs. horizontal half-pel reference (avg2 of adjacent pixels). */
1568 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1574 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1575 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1576 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1577 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1578 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1579 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1580 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1581 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs. vertical half-pel reference (avg2 with the next row). */
1588 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1591 uint8_t *pix3 = pix2 + line_size;
1595 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1596 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1597 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1598 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1599 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1600 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1601 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1602 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs. diagonal half-pel reference (avg4 of a 2x2 window). */
1610 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1613 uint8_t *pix3 = pix2 + line_size;
1617 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1618 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1619 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1620 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1621 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1622 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1623 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1624 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 wide: score1 is plain SSE; score2 is the
 * difference in local 2x2 gradient energy between the two blocks, weighted
 * by avctx->nsse_weight (default 8 when no context is supplied). */
1632 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1633 MpegEncContext *c = v;
1639 for(x=0; x<16; x++){
1640 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1643 for(x=0; x<15; x++){
1644 score2+= FFABS( s1[x ] - s1[x +stride]
1645 - s1[x+1] + s1[x+1+stride])
1646 -FFABS( s2[x ] - s2[x +stride]
1647 - s2[x+1] + s2[x+1+stride]);
1654 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1655 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; identical scoring with narrower rows.
 * NOTE(review): the x-loop headers appear elided in this excerpt. */
1658 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1659 MpegEncContext *c = v;
1666 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1670 score2+= FFABS( s1[x ] - s1[x +stride]
1671 - s1[x+1] + s1[x+1+stride])
1672 -FFABS( s2[x ] - s2[x +stride]
1673 - s2[x+1] + s2[x+1+stride]);
1680 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1681 else return score1 + FFABS(score2)*8;
/* Trellis helpers: try_8x8basis_c evaluates the weighted squared error of
 * adding 'basis' scaled by 'scale' to the residual 'rem' (fixed point,
 * BASIS_SHIFT/RECON_SHIFT); add_8x8basis_c applies that same scaled basis
 * in place. The rounding term (1<<(BASIS_SHIFT-RECON_SHIFT-1)) is identical
 * in both so try/add stay consistent. */
1684 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1688 for(i=0; i<8*8; i++){
1689 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1692 assert(-512<b && b<512);
1694 sum += (w*b)*(w*b)>>4;
1699 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1702 for(i=0; i<8*8; i++){
1703 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1708 * Permute an 8x8 block.
1709 * @param block the block which will be permuted according to the given permutation vector
1710 * @param permutation the permutation vector
1711 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1712 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1713 * (inverse) permutated to scantable order!
1715 void ff_block_permute(int16_t *block, uint8_t *permutation, const uint8_t *scantable, int last)
1721 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: stash the coefficients (in scantable order) into temp[] */
1723 for(i=0; i<=last; i++){
1724 const int j= scantable[i];
/* second pass: write them back through the permutation vector */
1729 for(i=0; i<=last; i++){
1730 const int j= scantable[i];
1731 const int perm_j= permutation[j];
1732 block[perm_j]= temp[j];
/* zero_cmp: trivial comparator that ignores its inputs (body elided here). */
1736 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fill the cmp[0..5] function-pointer array from the DSPContext
 * according to the requested FF_CMP_* 'type'. Only some switch arms are
 * visible in this excerpt; unknown types fall through to the error log. */
1740 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1743 memset(cmp, 0, sizeof(void*)*6);
1751 cmp[i]= c->hadamard8_diff[i];
1757 cmp[i]= c->dct_sad[i];
1760 cmp[i]= c->dct264_sad[i];
1763 cmp[i]= c->dct_max[i];
1766 cmp[i]= c->quant_psnr[i];
1787 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] over w bytes. The main loop adds sizeof(long)
 * bytes at a time using the pb_7f/pb_80 masks (SWAR: low 7 bits added
 * directly, carry into bit 7 reconstructed via XOR); the tail loop handles
 * the remaining bytes one at a time. */
1792 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1794 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1795 long a = *(long*)(src+i);
1796 long b = *(long*)(dst+i);
1797 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1800 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i] over w bytes. On targets without
 * fast unaligned loads, a plain unrolled byte loop is used when src2 is
 * misaligned; otherwise the SWAR word loop subtracts sizeof(long) bytes at
 * a time (borrow handled through the pb_7f/pb_80 masks), with a scalar
 * tail for the leftover bytes. */
1803 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1805 #if !HAVE_FAST_UNALIGNED
1806 if((long)src2 & (sizeof(long)-1)){
1807 for(i=0; i+7<w; i+=8){
1808 dst[i+0] = src1[i+0]-src2[i+0];
1809 dst[i+1] = src1[i+1]-src2[i+1];
1810 dst[i+2] = src1[i+2]-src2[i+2];
1811 dst[i+3] = src1[i+3]-src2[i+3];
1812 dst[i+4] = src1[i+4]-src2[i+4];
1813 dst[i+5] = src1[i+5]-src2[i+5];
1814 dst[i+6] = src1[i+6]-src2[i+6];
1815 dst[i+7] = src1[i+7]-src2[i+7];
1819 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1820 long a = *(long*)(src1+i);
1821 long b = *(long*)(src2+i);
1822 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1825 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction (add/sub pair): the predictor is
 * mid_pred(left, top, left + top - topleft) masked to a byte; 'left' and
 * 'left_top' carry state across calls via pointers.
 * NOTE(review): most of these function bodies are elided in this excerpt;
 * only the core prediction lines are visible. */
1828 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1836 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1845 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1853 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1863 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1866 for(i=0; i<w-1; i++){
1893 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard building blocks used by the hadamard8_* metrics below. */
1923 #define BUTTERFLY2(o1,o2,i1,i2) \
1927 #define BUTTERFLY1(x,y) \
1936 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference (src - dst): an 8-point Hadamard transform is
 * applied first along each row (BUTTERFLY2/BUTTERFLY1 stages), then along
 * each column, and the sum of absolute transform coefficients is returned
 * via BUTTERFLYA in the final column stage. */
1938 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1946 //FIXME try pointer walks
/* row transform: stage 1 on pixel differences */
1947 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1948 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1949 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1950 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1952 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1953 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1954 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1955 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1957 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1958 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1959 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1960 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform, accumulating |coeff| in the last stage */
1964 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1965 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1966 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1967 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1969 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1970 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1971 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1972 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1975 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1976 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1977 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1978 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same Hadamard pipeline as hadamard8_diff8x8_c but applied to
 * the source pixels directly ('dummy' is unused); the DC term (mean) is
 * subtracted from the total at the end. */
1983 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
1991 //FIXME try pointer walks
1992 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
1993 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
1994 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
1995 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
1997 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1998 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1999 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2000 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2002 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2003 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2004 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2005 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2009 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2010 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2011 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2012 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2014 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2015 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2016 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2017 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2020 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2021 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2022 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2023 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2026 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forms the pixel difference, transforms it (the fdct call
 * between diff_pixels and sum_abs_dctelem is elided in this excerpt) and
 * returns the sum of absolute DCT coefficients. */
2031 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2032 MpegEncContext * const s= (MpegEncContext *)c;
2033 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2037 s->dsp.diff_pixels(temp, src1, src2, stride);
2039 return s->dsp.sum_abs_dctelem(temp);
/* Body of the DCT8_1D macro (its #define line is elided in this excerpt):
 * the H.264-style integer 8-point DCT. s/d pairs are the symmetric and
 * antisymmetric sums of mirrored inputs; a0..a7 are the butterfly
 * intermediates; the shifts (>>1, >>2) implement the fixed-point
 * coefficient scaling. Reads via SRC(x), writes via DST(x, v). */
2044 const int s07 = SRC(0) + SRC(7);\
2045 const int s16 = SRC(1) + SRC(6);\
2046 const int s25 = SRC(2) + SRC(5);\
2047 const int s34 = SRC(3) + SRC(4);\
2048 const int a0 = s07 + s34;\
2049 const int a1 = s16 + s25;\
2050 const int a2 = s07 - s34;\
2051 const int a3 = s16 - s25;\
2052 const int d07 = SRC(0) - SRC(7);\
2053 const int d16 = SRC(1) - SRC(6);\
2054 const int d25 = SRC(2) - SRC(5);\
2055 const int d34 = SRC(3) - SRC(4);\
2056 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2057 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2058 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2059 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2061 DST(1, a4 + (a7>>2)) ;\
2062 DST(2, a2 + (a3>>1)) ;\
2063 DST(3, a5 + (a6>>2)) ;\
2065 DST(5, a6 - (a5>>2)) ;\
2066 DST(6, (a2>>1) - a3 ) ;\
2067 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: difference block is run through DCT8_1D over rows
 * (SRC/DST redefined as row access) and then over columns, where DST is
 * redefined to accumulate |coefficient| into sum instead of storing. */
2070 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2071 MpegEncContext * const s= (MpegEncContext *)c;
2076 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2078 #define SRC(x) dct[i][x]
2079 #define DST(x,v) dct[i][x]= v
2080 for( i = 0; i < 8; i++ )
2085 #define SRC(x) dct[x][i]
2086 #define DST(x,v) sum += FFABS(v)
2087 for( i = 0; i < 8; i++ )
/* DCT-max metric: like dct_sad8x8_c but returns the largest absolute DCT
 * coefficient of the difference block (fdct call elided in this excerpt). */
2095 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2096 MpegEncContext * const s= (MpegEncContext *)c;
2097 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2102 s->dsp.diff_pixels(temp, src1, src2, stride);
2106 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the difference, keep a copy in bak[],
 * quantize + dequantize + IDCT (round trip through the codec's own
 * quantizer), then return the squared error between the round-tripped and
 * original coefficients — i.e. the distortion the quantizer itself adds. */
2111 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2112 MpegEncContext * const s= (MpegEncContext *)c;
2113 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2114 int16_t * const bak = temp+64;
2120 s->dsp.diff_pixels(temp, src1, src2, stride);
2122 memcpy(bak, temp, 64*sizeof(int16_t));
2124 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2125 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2126 ff_simple_idct_8(temp); //FIXME
2129 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantizes the DCT of the
 * difference, counts the VLC bits needed to code the coefficients
 * (run/level pairs in scantable order, escape cost for out-of-range
 * levels), reconstructs the block (unquantize + idct_add) and returns
 * distortion + lambda-scaled rate (the 109/128 * qscale^2 factor). */
2134 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2135 MpegEncContext * const s= (MpegEncContext *)c;
2136 const uint8_t *scantable= s->intra_scantable.permutated;
2137 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2138 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2139 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2140 int i, last, run, bits, level, distortion, start_i;
2141 const int esc_length= s->ac_esc_length;
2143 uint8_t * last_length;
/* work on local 8x8 copies so idct_add below can reconstruct in place */
2147 copy_block8(lsrc1, src1, 8, stride, 8);
2148 copy_block8(lsrc2, src2, 8, stride, 8);
2150 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2152 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra additionally codes the DC term */
2158 length = s->intra_ac_vlc_length;
2159 last_length= s->intra_ac_vlc_last_length;
2160 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2163 length = s->inter_ac_vlc_length;
2164 last_length= s->inter_ac_vlc_last_length;
2169 for(i=start_i; i<last; i++){
2170 int j= scantable[i];
/* level in -64..63 fits the combined run/level table; else escape cost */
2175 if((level&(~127)) == 0){
2176 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2185 level= temp[i] + 64;
2189 if((level&(~127)) == 0){
2190 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2198 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2200 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2203 s->dsp.idct_add(lsrc2, 8, temp);
2205 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2207 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: same bit-counting pass as rd8x8_c (quantize, walk the
 * scantable, sum VLC/escape lengths) but without reconstruction or a
 * distortion term — returns just the estimated bit cost. */
2210 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2211 MpegEncContext * const s= (MpegEncContext *)c;
2212 const uint8_t *scantable= s->intra_scantable.permutated;
2213 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2214 int i, last, run, bits, level, start_i;
2215 const int esc_length= s->ac_esc_length;
2217 uint8_t * last_length;
2221 s->dsp.diff_pixels(temp, src1, src2, stride);
2223 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2229 length = s->intra_ac_vlc_length;
2230 last_length= s->intra_ac_vlc_last_length;
2231 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2234 length = s->inter_ac_vlc_length;
2235 last_length= s->inter_ac_vlc_last_length;
2240 for(i=start_i; i<last; i++){
2241 int j= scantable[i];
2246 if((level&(~127)) == 0){
2247 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2256 level= temp[i] + 64;
2260 if((level&(~127)) == 0){
2261 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-SAD metrics: sum of absolute differences between each row and
 * the row above. The _intra variants measure that within one block (s),
 * vsad16_c measures it on the residual s1 - s2. */
2269 #define VSAD_INTRA(size) \
2270 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2274 for(y=1; y<h; y++){ \
2275 for(x=0; x<size; x+=4){ \
2276 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2277 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2287 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2292 for(x=0; x<16; x++){
2293 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Vertical-SSE metrics: like VSAD but squaring the row-to-row differences
 * (SQ) instead of taking absolute values. */
2302 #define SQ(a) ((a)*(a))
2303 #define VSSE_INTRA(size) \
2304 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2308 for(y=1; y<h; y++){ \
2309 for(x=0; x<size; x+=4){ \
2310 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2311 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2321 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2326 for(x=0; x<16; x++){
2327 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 vector of
 * 'size' elements. */
2336 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2340 for(i=0; i<size; i++)
2341 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build 16x16 versions of the 8x8 metrics by summing the four 8x8
 * quadrants (WRAPPER8_16_SQ is defined elsewhere in this file). */
2345 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2346 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2347 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2349 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2351 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2352 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2353 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2354 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Float clipping done on the raw IEEE-754 bit patterns, valid only when
 * min < 0 < max (opposite signs): a float compared as its uint32 pattern
 * orders correctly within one sign, and maxisign (max with the sign bit
 * flipped) lets one unsigned compare handle the negative side. */
2356 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2357 uint32_t maxi, uint32_t maxisign)
2360 if(a > mini) return mini;
2361 else if((a^(1U<<31)) > maxisign) return maxi;
/* Bulk driver: processes len floats, 8 per iteration (len is assumed to be
 * a multiple of 8 — TODO confirm against callers). */
2365 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2367 uint32_t mini = *(uint32_t*)min;
2368 uint32_t maxi = *(uint32_t*)max;
2369 uint32_t maxisign = maxi ^ (1U<<31);
2370 uint32_t *dsti = (uint32_t*)dst;
2371 const uint32_t *srci = (const uint32_t*)src;
2372 for(i=0; i<len; i+=8) {
2373 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2374 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2375 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2376 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2377 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2378 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2379 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2380 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each element of src into [min, max] and store to dst.
 * Dispatches to the bit-pattern fast path when the range straddles zero;
 * otherwise clips with av_clipf, manually unrolled 8x (len is presumably a
 * multiple of 8 — verify against DSPContext callers).
 * NOTE(review): truncated — the `int i;` declaration, the else branch
 * bracketing and closing braces are not visible in this chunk. */
2383 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2385     if(min < 0 && max > 0) {
2386         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2388         for(i=0; i < len; i+=8) {
2389             dst[i    ] = av_clipf(src[i    ], min, max);
2390             dst[i + 1] = av_clipf(src[i + 1], min, max);
2391             dst[i + 2] = av_clipf(src[i + 2], min, max);
2392             dst[i + 3] = av_clipf(src[i + 3], min, max);
2393             dst[i + 4] = av_clipf(src[i + 4], min, max);
2394             dst[i + 5] = av_clipf(src[i + 5], min, max);
2395             dst[i + 6] = av_clipf(src[i + 6], min, max);
2396             dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors of length `order`, accumulated in int32.
 * NOTE(review): truncated — loop header, accumulator declaration and return
 * are not visible in this chunk. */
2401 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2406         res += *v1++ * *v2++;
/* Fused op: returns the dot product of v1 and v2 while also doing
 * v1 += mul * v3 element-wise (both over `order` elements).
 * NOTE(review): truncated — only the madd half of the loop body is visible;
 * the dot-product accumulation and return are elided from this chunk. */
2411 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2416         *v1++ += mul * *v3++;
/* Apply a symmetric int16 window to an int16 signal: the first half of the
 * window is mirrored onto the second half of the input, each product scaled
 * by Q15 rounding ((x*w + 2^14) >> 15).
 * NOTE(review): truncated — the `int i;` declaration and the closing braces
 * are not visible in this chunk. */
2421 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2422                                  const int16_t *window, unsigned int len)
2425     int len2 = len >> 1;
2427     for (i = 0; i < len2; i++) {
2428         int16_t w = window[i];
2429         output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2430         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip each int32 of src into [min, max], storing to dst; the body is
 * manually unrolled 8x (the surrounding `do { } while` loop is elided from
 * this chunk), so len is presumably a multiple of 8 — verify against the
 * DSPContext contract. */
2434 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2435                                 int32_t max, unsigned int len)
2438         *dst++ = av_clip(*src++, min, max);
2439         *dst++ = av_clip(*src++, min, max);
2440         *dst++ = av_clip(*src++, min, max);
2441         *dst++ = av_clip(*src++, min, max);
2442         *dst++ = av_clip(*src++, min, max);
2443         *dst++ = av_clip(*src++, min, max);
2444         *dst++ = av_clip(*src++, min, max);
2445         *dst++ = av_clip(*src++, min, max);
/* JPEG-reference IDCT + put: inverse-transform the block in place, then write
 * the clamped result to dest. */
2450 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2452     ff_j_rev_dct (block);
2453     put_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT + add: inverse-transform the block in place, then add
 * the result onto dest with clamping. */
2455 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2457     ff_j_rev_dct (block);
2458     add_pixels_clamped_c(block, dest, line_size);
2461 /* init static data */
/* One-time initialization of the global lookup tables:
 *  - ff_cropTbl: clamps an int in [-MAX_NEG_CROP, 255+MAX_NEG_CROP] to [0,255]
 *    (NOTE(review): the line zeroing the low MAX_NEG_CROP entries is elided
 *    from this chunk);
 *  - ff_squareTbl: (i-256)^2 for fast squared-difference lookups;
 *  - ff_inv_zigzag_direct16: inverse zigzag permutation, stored 1-based. */
2462 av_cold void ff_dsputil_static_init(void)
2466     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2467     for(i=0;i<MAX_NEG_CROP;i++) {
2469         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2472     for(i=0;i<512;i++) {
2473         ff_squareTbl[i] = (i - 256) * (i - 256);
2476     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Sanity check that the compiler honors 16-byte stack alignment (required by
 * the MMX/AltiVec code paths). Logs a one-time warning if a LOCAL_ALIGNED_16
 * variable lands misaligned; `did_fail` presumably suppresses repeat logging
 * (the branch using it is elided from this chunk).
 * NOTE(review): truncated — return statements and closing braces missing. */
2479 int ff_check_alignment(void){
2480     static int did_fail=0;
2481     LOCAL_ALIGNED_16(int, aligned, [4]);
2483     if((intptr_t)aligned & 15){
2485 #if HAVE_MMX || HAVE_ALTIVEC
2486             av_log(NULL, AV_LOG_ERROR,
2487                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2488                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2489                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2490                 "Do not report crashes to Libav developers.\n");
/* Populate a DSPContext with the C reference implementations, selected by the
 * codec's bit depth and the DCT/IDCT algorithm requested in avctx, then let
 * the per-architecture init functions override entries with optimized
 * versions.
 * NOTE(review): this chunk is heavily elided — many branches, braces and the
 * function's closing lines are not visible here; comments below describe only
 * what the visible lines establish. */
2499 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2501     ff_check_alignment();
/* --- Encoder forward-DCT selection (inside a CONFIG_ENCODERS region whose
 *     opening #if is elided): 10-bit forces islow_10; otherwise the requested
 *     dct_algo picks ifast / FAAN / islow_8 (accurate default). */
2504     if (avctx->bits_per_raw_sample == 10) {
2505         c->fdct    = ff_jpeg_fdct_islow_10;
2506         c->fdct248 = ff_fdct248_islow_10;
2508         if(avctx->dct_algo==FF_DCT_FASTINT) {
2509             c->fdct    = ff_fdct_ifast;
2510             c->fdct248 = ff_fdct_ifast248;
2512         else if(avctx->dct_algo==FF_DCT_FAAN) {
2513             c->fdct    = ff_faandct;
2514             c->fdct248 = ff_faandct248;
2517             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2518             c->fdct248 = ff_fdct248_islow_8;
2521 #endif //CONFIG_ENCODERS
/* --- IDCT selection: 10-bit uses simple_idct_10; otherwise idct_algo picks
 *     the JPEG reference (libmpeg2 permutation), FAAN, or simple_idct_8. */
2523     if (avctx->bits_per_raw_sample == 10) {
2524         c->idct_put              = ff_simple_idct_put_10;
2525         c->idct_add              = ff_simple_idct_add_10;
2526         c->idct                  = ff_simple_idct_10;
2527         c->idct_permutation_type = FF_NO_IDCT_PERM;
2529         if(avctx->idct_algo==FF_IDCT_INT){
2530             c->idct_put= ff_jref_idct_put;
2531             c->idct_add= ff_jref_idct_add;
2532             c->idct    = ff_j_rev_dct;
2533             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2534         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2535             c->idct_put= ff_faanidct_put;
2536             c->idct_add= ff_faanidct_add;
2537             c->idct    = ff_faanidct;
2538             c->idct_permutation_type= FF_NO_IDCT_PERM;
2539         }else{ //accurate/default
2540             c->idct_put = ff_simple_idct_put_8;
2541             c->idct_add = ff_simple_idct_add_8;
2542             c->idct     = ff_simple_idct_8;
2543             c->idct_permutation_type= FF_NO_IDCT_PERM;
/* --- Pixel/block helpers. */
2547     c->diff_pixels = diff_pixels_c;
2548     c->put_pixels_clamped = put_pixels_clamped_c;
2549     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2550     c->add_pixels_clamped = add_pixels_clamped_c;
2551     c->sum_abs_dctelem = sum_abs_dctelem_c;
2554     c->pix_sum = pix_sum_c;
2555     c->pix_norm1 = pix_norm1_c;
2557     c->fill_block_tab[0] = fill_block16_c;
2558     c->fill_block_tab[1] = fill_block8_c;
/* --- SAD tables: index [0] = 16-wide, [1] = 8-wide; sub-index selects the
 *     half-pel interpolation (none / x2 / y2 / xy2). */
2560     /* TODO [0] 16  [1] 8 */
2561     c->pix_abs[0][0] = pix_abs16_c;
2562     c->pix_abs[0][1] = pix_abs16_x2_c;
2563     c->pix_abs[0][2] = pix_abs16_y2_c;
2564     c->pix_abs[0][3] = pix_abs16_xy2_c;
2565     c->pix_abs[1][0] = pix_abs8_c;
2566     c->pix_abs[1][1] = pix_abs8_x2_c;
2567     c->pix_abs[1][2] = pix_abs8_y2_c;
2568     c->pix_abs[1][3] = pix_abs8_xy2_c;
/* --- Third-pel motion compensation tables (SVQ3-style mcXY naming). */
2570     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2571     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2572     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2573     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2574     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2575     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2576     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2577     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2578     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2580     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2581     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2582     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2583     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2584     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2585     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2586     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2587     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2588     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Fill all 16 quarter-pel MC slots (mc00..mc33) of a pixels_tab row. */
2590 #define dspfunc(PFX, IDX, NUM) \
2591     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2592     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2593     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2594     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2595     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2596     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2597     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2598     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2599     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2600     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2601     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2602     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2603     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2604     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2605     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2606     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2608     dspfunc(put_qpel, 0, 16);
2609     dspfunc(put_no_rnd_qpel, 0, 16);
2611     dspfunc(avg_qpel, 0, 16);
2612     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2614     dspfunc(put_qpel, 1, 8);
2615     dspfunc(put_no_rnd_qpel, 1, 8);
2617     dspfunc(avg_qpel, 1, 8);
2618     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* --- WMV2 mspel (mixed-pel) table; slot 0 reuses the plain 8x8 copy. */
2622     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2623     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2624     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2625     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2626     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2627     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2628     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2629     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Install the 16-wide and 8x8 variants of a comparison function pair. */
2631 #define SET_CMP_FUNC(name) \
2632     c->name[0]= name ## 16_c;\
2633     c->name[1]= name ## 8x8_c;
2635     SET_CMP_FUNC(hadamard8_diff)
2636     c->hadamard8_diff[4]= hadamard8_intra16_c;
2637     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2638     SET_CMP_FUNC(dct_sad)
2639     SET_CMP_FUNC(dct_max)
2641     SET_CMP_FUNC(dct264_sad)
2643     c->sad[0]= pix_abs16_c;
2644     c->sad[1]= pix_abs8_c;
2648     SET_CMP_FUNC(quant_psnr)
2651     c->vsad[0]= vsad16_c;
2652     c->vsad[4]= vsad_intra16_c;
2653     c->vsad[5]= vsad_intra8_c;
2654     c->vsse[0]= vsse16_c;
2655     c->vsse[4]= vsse_intra16_c;
2656     c->vsse[5]= vsse_intra8_c;
2657     c->nsse[0]= nsse16_c;
2658     c->nsse[1]= nsse8_c;
2660     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* --- HuffYUV / lossless prediction helpers and byte-swapping. */
2662     c->add_bytes= add_bytes_c;
2663     c->diff_bytes= diff_bytes_c;
2664     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2665     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2666     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2667     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2668     c->bswap_buf= bswap_buf;
2669     c->bswap16_buf = bswap16_buf;
2671     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2672         c->h263_h_loop_filter= h263_h_loop_filter_c;
2673         c->h263_v_loop_filter= h263_v_loop_filter_c;
2676     c->h261_loop_filter= h261_loop_filter_c;
2678     c->try_8x8basis= try_8x8basis_c;
2679     c->add_8x8basis= add_8x8basis_c;
/* --- Generic vector ops (also used by audio codecs). */
2681     c->vector_clipf = vector_clipf_c;
2682     c->scalarproduct_int16 = scalarproduct_int16_c;
2683     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2684     c->apply_window_int16 = apply_window_int16_c;
2685     c->vector_clip_int32 = vector_clip_int32_c;
/* --- Plane shrinkers: index n halves each dimension n times (0 = copy). */
2687     c->shrink[0]= av_image_copy_plane;
2688     c->shrink[1]= ff_shrink22;
2689     c->shrink[2]= ff_shrink44;
2690     c->shrink[3]= ff_shrink88;
/* Install the four half-pel variants (copy / x2 / y2 / xy2) for one
 * block-size row of an 8-bit pixels_tab. */
2692 #define hpel_funcs(prefix, idx, num) \
2693     c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2694     c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2695     c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2696     c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2698     hpel_funcs(put, [0],  16);
2699     hpel_funcs(put, [1],  8);
2700     hpel_funcs(put, [2],  4);
2701     hpel_funcs(put, [3],  2);
2702     hpel_funcs(put_no_rnd, [0], 16);
2703     hpel_funcs(put_no_rnd, [1],  8);
2704     hpel_funcs(avg, [0], 16);
2705     hpel_funcs(avg, [1],  8);
2706     hpel_funcs(avg, [2],  4);
2707     hpel_funcs(avg, [3],  2);
2708     hpel_funcs(avg_no_rnd,[0], 16);
/* --- Bit-depth-templated functions: FUNC/FUNCC paste the depth suffix onto
 *     names generated by the dsputil_template.c inclusions at the top of the
 *     file. */
2712 #define FUNC(f, depth) f ## _ ## depth
2713 #define FUNCC(f, depth) f ## _ ## depth ## _c
2715 #define dspfunc2(PFX, IDX, NUM, depth)\
2716     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
2717     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
2718     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
2719     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
2720     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
2721     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
2722     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
2723     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
2724     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
2725     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
2726     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
2727     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
2728     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
2729     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
2730     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
2731     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
/* Install every depth-dependent entry point (pixel access, H.264 chroma MC
 * and qpel MC) for the given bit depth and DCT coefficient width. */
2733 #define BIT_DEPTH_FUNCS(depth, dct)\
2734     c->get_pixels                    = FUNCC(get_pixels ## dct   , depth);\
2735     c->draw_edges                    = FUNCC(draw_edges          , depth);\
2736     c->clear_block                   = FUNCC(clear_block ## dct  , depth);\
2737     c->clear_blocks                  = FUNCC(clear_blocks ## dct , depth);\
2738     c->add_pixels8                   = FUNCC(add_pixels8 ## dct  , depth);\
2739     c->add_pixels4                   = FUNCC(add_pixels4 ## dct  , depth);\
2741     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
2742     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
2743     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
2744     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
2745     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
2746     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
2748     dspfunc2(put_h264_qpel, 0, 16, depth);\
2749     dspfunc2(put_h264_qpel, 1,  8, depth);\
2750     dspfunc2(put_h264_qpel, 2,  4, depth);\
2751     dspfunc2(put_h264_qpel, 3,  2, depth);\
2752     dspfunc2(avg_h264_qpel, 0, 16, depth);\
2753     dspfunc2(avg_h264_qpel, 1,  8, depth);\
2754     dspfunc2(avg_h264_qpel, 2,  4, depth);
/* --- Select the bit-depth variants (case labels are elided in this chunk;
 *     visible arms correspond to 9-, 10- and 8-bit, with dct_bits choosing
 *     16- vs 32-bit DCT coefficients). */
2756     switch (avctx->bits_per_raw_sample) {
2758         if (c->dct_bits == 32) {
2759             BIT_DEPTH_FUNCS(9, _32);
2761             BIT_DEPTH_FUNCS(9, _16);
2765         if (c->dct_bits == 32) {
2766             BIT_DEPTH_FUNCS(10, _32);
2768             BIT_DEPTH_FUNCS(10, _16);
2772         BIT_DEPTH_FUNCS(8, _16);
/* --- Let each architecture override entries with optimized versions, then
 *     build the IDCT coefficient permutation table. */
2777     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2778     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2779     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2780     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2781     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2782     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2783     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2785     ff_init_scantable_permutation(c->idct_permutation,
2786                                   c->idct_permutation_type);