git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "diracdsp.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  /* Instantiate dsputil_template.c once per BIT_DEPTH value: 9, 10, then 8. */
  46 #define BIT_DEPTH 9
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 10
  51 #include "dsputil_template.c"
  52 #undef BIT_DEPTH
  53
  /* NOTE(review): there is no #undef after this last inclusion, so BIT_DEPTH
   * stays defined as 8 for the rest of the file - presumably on purpose;
   * confirm before reordering these inclusions. */
  54 #define BIT_DEPTH 8
  55 #include "dsputil_template.c"
  56
  57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  58 #define pb_7f (~0UL/255 * 0x7f)
  59 #define pb_80 (~0UL/255 * 0x80)
  60
  61 const uint8_t ff_zigzag_direct[64] = {
  62     0,   1,  8, 16,  9,  2,  3, 10,
  63     17, 24, 32, 25, 18, 11,  4,  5,
  64     12, 19, 26, 33, 40, 48, 41, 34,
  65     27, 20, 13,  6,  7, 14, 21, 28,
  66     35, 42, 49, 56, 57, 50, 43, 36,
  67     29, 22, 15, 23, 30, 37, 44, 51,
  68     58, 59, 52, 45, 38, 31, 39, 46,
  69     53, 60, 61, 54, 47, 55, 62, 63
  70 };
  71
  72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  73    specification, we interleave the fields */
  74 const uint8_t ff_zigzag248_direct[64] = {
  75      0,  8,  1,  9, 16, 24,  2, 10,
  76     17, 25, 32, 40, 48, 56, 33, 41,
  77     18, 26,  3, 11,  4, 12, 19, 27,
  78     34, 42, 49, 57, 50, 58, 35, 43,
  79     20, 28,  5, 13,  6, 14, 21, 29,
  80     36, 44, 51, 59, 52, 60, 37, 45,
  81     22, 30,  7, 15, 23, 31, 38, 46,
  82     53, 61, 54, 62, 39, 47, 55, 63,
  83 };
  84
  85 /* Inverse of zigzag_direct with 1 added to each entry and without any IDCT
      * permutation applied; intended for the MMX quantizer. The table is only
      * declared here - it is filled in by code outside this excerpt. */
  86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  87
  88 const uint8_t ff_alternate_horizontal_scan[64] = {
  89     0,  1,   2,  3,  8,  9, 16, 17,
  90     10, 11,  4,  5,  6,  7, 15, 14,
  91     13, 12, 19, 18, 24, 25, 32, 33,
  92     26, 27, 20, 21, 22, 23, 28, 29,
  93     30, 31, 34, 35, 40, 41, 48, 49,
  94     42, 43, 36, 37, 38, 39, 44, 45,
  95     46, 47, 50, 51, 56, 57, 58, 59,
  96     52, 53, 54, 55, 60, 61, 62, 63,
  97 };
  98
  99 const uint8_t ff_alternate_vertical_scan[64] = {
 100     0,  8,  16, 24,  1,  9,  2, 10,
 101     17, 25, 32, 40, 48, 56, 57, 49,
 102     41, 33, 26, 18,  3, 11,  4, 12,
 103     19, 27, 34, 42, 50, 58, 35, 43,
 104     51, 59, 20, 28,  5, 13,  6, 14,
 105     21, 29, 36, 44, 52, 60, 37, 45,
 106     53, 61, 22, 30,  7, 15, 23, 31,
 107     38, 46, 54, 62, 39, 47, 55, 63,
 108 };
 109
 110 /* Input permutation for the simple_idct_mmx */
 111 static const uint8_t simple_mmx_permutation[64]={
 112         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 113         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 114         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 115         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 116         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 117         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 118         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 119         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 120 };
 121
 122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134 #if ARCH_PPC
 135         st->inverse[j] = i;
 136 #endif
 137     }
 138
 139     end=-1;
 140     for(i=0; i<64; i++){
 141         int j;
 142         j = st->permutated[i];
 143         if(j>end) end=j;
 144         st->raster_end[i]= end;
 145     }
 146 }
 147
 148 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 149                                    int idct_permutation_type)
 150 {
 151     int i;
 152
 153     switch(idct_permutation_type){
 154     case FF_NO_IDCT_PERM:
 155         for(i=0; i<64; i++)
 156             idct_permutation[i]= i;
 157         break;
 158     case FF_LIBMPEG2_IDCT_PERM:
 159         for(i=0; i<64; i++)
 160             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 161         break;
 162     case FF_SIMPLE_IDCT_PERM:
 163         for(i=0; i<64; i++)
 164             idct_permutation[i]= simple_mmx_permutation[i];
 165         break;
 166     case FF_TRANSPOSE_IDCT_PERM:
 167         for(i=0; i<64; i++)
 168             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 169         break;
 170     case FF_PARTTRANS_IDCT_PERM:
 171         for(i=0; i<64; i++)
 172             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 173         break;
 174     case FF_SSE2_IDCT_PERM:
 175         for(i=0; i<64; i++)
 176             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 177         break;
 178     default:
 179         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 180     }
 181 }
 182
 183 static int pix_sum_c(uint8_t * pix, int line_size)
 184 {
 185     int s, i, j;
 186
 187     s = 0;
 188     for (i = 0; i < 16; i++) {
 189         for (j = 0; j < 16; j += 8) {
 190             s += pix[0];
 191             s += pix[1];
 192             s += pix[2];
 193             s += pix[3];
 194             s += pix[4];
 195             s += pix[5];
 196             s += pix[6];
 197             s += pix[7];
 198             pix += 8;
 199         }
 200         pix += line_size - 16;
 201     }
 202     return s;
 203 }
 204
 205 static int pix_norm1_c(uint8_t * pix, int line_size)
 206 {
 207     int s, i, j;
 208     uint32_t *sq = ff_squareTbl + 256;
 209
 210     s = 0;
 211     for (i = 0; i < 16; i++) {
 212         for (j = 0; j < 16; j += 8) {
 213 #if 0
 214             s += sq[pix[0]];
 215             s += sq[pix[1]];
 216             s += sq[pix[2]];
 217             s += sq[pix[3]];
 218             s += sq[pix[4]];
 219             s += sq[pix[5]];
 220             s += sq[pix[6]];
 221             s += sq[pix[7]];
 222 #else
 223 #if HAVE_FAST_64BIT
 224             register uint64_t x=*(uint64_t*)pix;
 225             s += sq[x&0xff];
 226             s += sq[(x>>8)&0xff];
 227             s += sq[(x>>16)&0xff];
 228             s += sq[(x>>24)&0xff];
 229             s += sq[(x>>32)&0xff];
 230             s += sq[(x>>40)&0xff];
 231             s += sq[(x>>48)&0xff];
 232             s += sq[(x>>56)&0xff];
 233 #else
 234             register uint32_t x=*(uint32_t*)pix;
 235             s += sq[x&0xff];
 236             s += sq[(x>>8)&0xff];
 237             s += sq[(x>>16)&0xff];
 238             s += sq[(x>>24)&0xff];
 239             x=*(uint32_t*)(pix+4);
 240             s += sq[x&0xff];
 241             s += sq[(x>>8)&0xff];
 242             s += sq[(x>>16)&0xff];
 243             s += sq[(x>>24)&0xff];
 244 #endif
 245 #endif
 246             pix += 8;
 247         }
 248         pix += line_size - 16;
 249     }
 250     return s;
 251 }
 252
 253 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 254     int i;
 255
 256     for(i=0; i+8<=w; i+=8){
 257         dst[i+0]= av_bswap32(src[i+0]);
 258         dst[i+1]= av_bswap32(src[i+1]);
 259         dst[i+2]= av_bswap32(src[i+2]);
 260         dst[i+3]= av_bswap32(src[i+3]);
 261         dst[i+4]= av_bswap32(src[i+4]);
 262         dst[i+5]= av_bswap32(src[i+5]);
 263         dst[i+6]= av_bswap32(src[i+6]);
 264         dst[i+7]= av_bswap32(src[i+7]);
 265     }
 266     for(;i<w; i++){
 267         dst[i+0]= av_bswap32(src[i+0]);
 268     }
 269 }
 270
 271 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 272 {
 273     while (len--)
 274         *dst++ = av_bswap16(*src++);
 275 }
 276
 277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 278 {
 279     int s, i;
 280     uint32_t *sq = ff_squareTbl + 256;
 281
 282     s = 0;
 283     for (i = 0; i < h; i++) {
 284         s += sq[pix1[0] - pix2[0]];
 285         s += sq[pix1[1] - pix2[1]];
 286         s += sq[pix1[2] - pix2[2]];
 287         s += sq[pix1[3] - pix2[3]];
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 295 {
 296     int s, i;
 297     uint32_t *sq = ff_squareTbl + 256;
 298
 299     s = 0;
 300     for (i = 0; i < h; i++) {
 301         s += sq[pix1[0] - pix2[0]];
 302         s += sq[pix1[1] - pix2[1]];
 303         s += sq[pix1[2] - pix2[2]];
 304         s += sq[pix1[3] - pix2[3]];
 305         s += sq[pix1[4] - pix2[4]];
 306         s += sq[pix1[5] - pix2[5]];
 307         s += sq[pix1[6] - pix2[6]];
 308         s += sq[pix1[7] - pix2[7]];
 309         pix1 += line_size;
 310         pix2 += line_size;
 311     }
 312     return s;
 313 }
 314
 315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 316 {
 317     int s, i;
 318     uint32_t *sq = ff_squareTbl + 256;
 319
 320     s = 0;
 321     for (i = 0; i < h; i++) {
 322         s += sq[pix1[ 0] - pix2[ 0]];
 323         s += sq[pix1[ 1] - pix2[ 1]];
 324         s += sq[pix1[ 2] - pix2[ 2]];
 325         s += sq[pix1[ 3] - pix2[ 3]];
 326         s += sq[pix1[ 4] - pix2[ 4]];
 327         s += sq[pix1[ 5] - pix2[ 5]];
 328         s += sq[pix1[ 6] - pix2[ 6]];
 329         s += sq[pix1[ 7] - pix2[ 7]];
 330         s += sq[pix1[ 8] - pix2[ 8]];
 331         s += sq[pix1[ 9] - pix2[ 9]];
 332         s += sq[pix1[10] - pix2[10]];
 333         s += sq[pix1[11] - pix2[11]];
 334         s += sq[pix1[12] - pix2[12]];
 335         s += sq[pix1[13] - pix2[13]];
 336         s += sq[pix1[14] - pix2[14]];
 337         s += sq[pix1[15] - pix2[15]];
 338
 339         pix1 += line_size;
 340         pix2 += line_size;
 341     }
 342     return s;
 343 }
 344
 345 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 346                           const uint8_t *s2, int stride){
 347     int i;
 348
 349     /* read the pixels */
 350     for(i=0;i<8;i++) {
 351         block[0] = s1[0] - s2[0];
 352         block[1] = s1[1] - s2[1];
 353         block[2] = s1[2] - s2[2];
 354         block[3] = s1[3] - s2[3];
 355         block[4] = s1[4] - s2[4];
 356         block[5] = s1[5] - s2[5];
 357         block[6] = s1[6] - s2[6];
 358         block[7] = s1[7] - s2[7];
 359         s1 += stride;
 360         s2 += stride;
 361         block += 8;
 362     }
 363 }
 364
 365
 366 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 367                              int line_size)
 368 {
 369     int i;
 370     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 371
 372     /* read the pixels */
 373     for(i=0;i<8;i++) {
 374         pixels[0] = cm[block[0]];
 375         pixels[1] = cm[block[1]];
 376         pixels[2] = cm[block[2]];
 377         pixels[3] = cm[block[3]];
 378         pixels[4] = cm[block[4]];
 379         pixels[5] = cm[block[5]];
 380         pixels[6] = cm[block[6]];
 381         pixels[7] = cm[block[7]];
 382
 383         pixels += line_size;
 384         block += 8;
 385     }
 386 }
 387
 388 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 389                                  int line_size)
 390 {
 391     int i;
 392     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 393
 394     /* read the pixels */
 395     for(i=0;i<4;i++) {
 396         pixels[0] = cm[block[0]];
 397         pixels[1] = cm[block[1]];
 398         pixels[2] = cm[block[2]];
 399         pixels[3] = cm[block[3]];
 400
 401         pixels += line_size;
 402         block += 8;
 403     }
 404 }
 405
 406 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 407                                  int line_size)
 408 {
 409     int i;
 410     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 411
 412     /* read the pixels */
 413     for(i=0;i<2;i++) {
 414         pixels[0] = cm[block[0]];
 415         pixels[1] = cm[block[1]];
 416
 417         pixels += line_size;
 418         block += 8;
 419     }
 420 }
 421
 422 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 423                                     uint8_t *restrict pixels,
 424                                     int line_size)
 425 {
 426     int i, j;
 427
 428     for (i = 0; i < 8; i++) {
 429         for (j = 0; j < 8; j++) {
 430             if (*block < -128)
 431                 *pixels = 0;
 432             else if (*block > 127)
 433                 *pixels = 255;
 434             else
 435                 *pixels = (uint8_t)(*block + 128);
 436             block++;
 437             pixels++;
 438         }
 439         pixels += (line_size - 8);
 440     }
 441 }
 442
 443 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 444                              int line_size)
 445 {
 446     int i;
 447     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 448
 449     /* read the pixels */
 450     for(i=0;i<8;i++) {
 451         pixels[0] = cm[pixels[0] + block[0]];
 452         pixels[1] = cm[pixels[1] + block[1]];
 453         pixels[2] = cm[pixels[2] + block[2]];
 454         pixels[3] = cm[pixels[3] + block[3]];
 455         pixels[4] = cm[pixels[4] + block[4]];
 456         pixels[5] = cm[pixels[5] + block[5]];
 457         pixels[6] = cm[pixels[6] + block[6]];
 458         pixels[7] = cm[pixels[7] + block[7]];
 459         pixels += line_size;
 460         block += 8;
 461     }
 462 }
 463
 464 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 465                           int line_size)
 466 {
 467     int i;
 468     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 469
 470     /* read the pixels */
 471     for(i=0;i<4;i++) {
 472         pixels[0] = cm[pixels[0] + block[0]];
 473         pixels[1] = cm[pixels[1] + block[1]];
 474         pixels[2] = cm[pixels[2] + block[2]];
 475         pixels[3] = cm[pixels[3] + block[3]];
 476         pixels += line_size;
 477         block += 8;
 478     }
 479 }
 480
 481 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 482                           int line_size)
 483 {
 484     int i;
 485     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 486
 487     /* read the pixels */
 488     for(i=0;i<2;i++) {
 489         pixels[0] = cm[pixels[0] + block[0]];
 490         pixels[1] = cm[pixels[1] + block[1]];
 491         pixels += line_size;
 492         block += 8;
 493     }
 494 }
 495
 496 static int sum_abs_dctelem_c(DCTELEM *block)
 497 {
 498     int sum=0, i;
 499     for(i=0; i<64; i++)
 500         sum+= FFABS(block[i]);
 501     return sum;
 502 }
 503
 504 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 505 {
 506     int i;
 507
 508     for (i = 0; i < h; i++) {
 509         memset(block, value, 16);
 510         block += line_size;
 511     }
 512 }
 513
 514 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 515 {
 516     int i;
 517
 518     for (i = 0; i < h; i++) {
 519         memset(block, value, 8);
 520         block += line_size;
 521     }
 522 }
 523
 524 #define avg2(a,b) ((a+b+1)>>1)
 525 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 526
 527 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 528 {
 529     const int A=(16-x16)*(16-y16);
 530     const int B=(   x16)*(16-y16);
 531     const int C=(16-x16)*(   y16);
 532     const int D=(   x16)*(   y16);
 533     int i;
 534
 535     for(i=0; i<h; i++)
 536     {
 537         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 538         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 539         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 540         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 541         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 542         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 543         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 544         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 545         dst+= stride;
 546         src+= stride;
 547     }
 548 }
 549
 550 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 551                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 552 {
 553     int y, vx, vy;
 554     const int s= 1<<shift;
 555
 556     width--;
 557     height--;
 558
 559     for(y=0; y<h; y++){
 560         int x;
 561
 562         vx= ox;
 563         vy= oy;
 564         for(x=0; x<8; x++){ //XXX FIXME optimize
 565             int src_x, src_y, frac_x, frac_y, index;
 566
 567             src_x= vx>>16;
 568             src_y= vy>>16;
 569             frac_x= src_x&(s-1);
 570             frac_y= src_y&(s-1);
 571             src_x>>=shift;
 572             src_y>>=shift;
 573
 574             if((unsigned)src_x < width){
 575                 if((unsigned)src_y < height){
 576                     index= src_x + src_y*stride;
 577                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 578                                            + src[index       +1]*   frac_x )*(s-frac_y)
 579                                         + (  src[index+stride  ]*(s-frac_x)
 580                                            + src[index+stride+1]*   frac_x )*   frac_y
 581                                         + r)>>(shift*2);
 582                 }else{
 583                     index= src_x + av_clip(src_y, 0, height)*stride;
 584                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 585                                           + src[index       +1]*   frac_x )*s
 586                                         + r)>>(shift*2);
 587                 }
 588             }else{
 589                 if((unsigned)src_y < height){
 590                     index= av_clip(src_x, 0, width) + src_y*stride;
 591                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 592                                            + src[index+stride  ]*   frac_y )*s
 593                                         + r)>>(shift*2);
 594                 }else{
 595                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 596                     dst[y*stride + x]=    src[index         ];
 597                 }
 598             }
 599
 600             vx+= dxx;
 601             vy+= dyx;
 602         }
 603         ox += dxy;
 604         oy += dyy;
 605     }
 606 }
 607
 608 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 609     switch(width){
 610     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 611     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 612     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 613     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 614     }
 615 }
 616
 617 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 618     int i,j;
 619     for (i=0; i < height; i++) {
 620       for (j=0; j < width; j++) {
 621         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 622       }
 623       src += stride;
 624       dst += stride;
 625     }
 626 }
 627
 628 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 629     int i,j;
 630     for (i=0; i < height; i++) {
 631       for (j=0; j < width; j++) {
 632         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 633       }
 634       src += stride;
 635       dst += stride;
 636     }
 637 }
 638
 639 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 640     int i,j;
 641     for (i=0; i < height; i++) {
 642       for (j=0; j < width; j++) {
 643         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 644       }
 645       src += stride;
 646       dst += stride;
 647     }
 648 }
 649
 650 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 651     int i,j;
 652     for (i=0; i < height; i++) {
 653       for (j=0; j < width; j++) {
 654         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 655       }
 656       src += stride;
 657       dst += stride;
 658     }
 659 }
 660
 661 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 662     int i,j;
 663     for (i=0; i < height; i++) {
 664       for (j=0; j < width; j++) {
 665         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 666       }
 667       src += stride;
 668       dst += stride;
 669     }
 670 }
 671
 672 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 673     int i,j;
 674     for (i=0; i < height; i++) {
 675       for (j=0; j < width; j++) {
 676         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 677       }
 678       src += stride;
 679       dst += stride;
 680     }
 681 }
 682
 683 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 684     int i,j;
 685     for (i=0; i < height; i++) {
 686       for (j=0; j < width; j++) {
 687         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 688       }
 689       src += stride;
 690       dst += stride;
 691     }
 692 }
 693
 694 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 695     int i,j;
 696     for (i=0; i < height; i++) {
 697       for (j=0; j < width; j++) {
 698         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 699       }
 700       src += stride;
 701       dst += stride;
 702     }
 703 }
 704
 705 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 706     switch(width){
 707     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 708     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 709     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 710     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 711     }
 712 }
 713
 714 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 715     int i,j;
 716     for (i=0; i < height; i++) {
 717       for (j=0; j < width; j++) {
 718         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 719       }
 720       src += stride;
 721       dst += stride;
 722     }
 723 }
 724
 725 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 726     int i,j;
 727     for (i=0; i < height; i++) {
 728       for (j=0; j < width; j++) {
 729         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 730       }
 731       src += stride;
 732       dst += stride;
 733     }
 734 }
 735
 736 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 737     int i,j;
 738     for (i=0; i < height; i++) {
 739       for (j=0; j < width; j++) {
 740         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 741       }
 742       src += stride;
 743       dst += stride;
 744     }
 745 }
 746
 747 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 748     int i,j;
 749     for (i=0; i < height; i++) {
 750       for (j=0; j < width; j++) {
 751         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 752       }
 753       src += stride;
 754       dst += stride;
 755     }
 756 }
 757
 758 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 759     int i,j;
 760     for (i=0; i < height; i++) {
 761       for (j=0; j < width; j++) {
 762         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 763       }
 764       src += stride;
 765       dst += stride;
 766     }
 767 }
 768
 769 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 770     int i,j;
 771     for (i=0; i < height; i++) {
 772       for (j=0; j < width; j++) {
 773         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 774       }
 775       src += stride;
 776       dst += stride;
 777     }
 778 }
 779
 780 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 781     int i,j;
 782     for (i=0; i < height; i++) {
 783       for (j=0; j < width; j++) {
 784         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 785       }
 786       src += stride;
 787       dst += stride;
 788     }
 789 }
 790
 791 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 792     int i,j;
 793     for (i=0; i < height; i++) {
 794       for (j=0; j < width; j++) {
 795         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 796       }
 797       src += stride;
 798       dst += stride;
 799     }
 800 }
 801
 802 #define QPEL_MC(r, OPNAME, RND, OP) \
 803 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 804     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 805     int i;\
 806     for(i=0; i<h; i++)\
 807     {\
 808         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 809         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 810         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 811         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 812         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 813         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 814         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 815         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 816         dst+=dstStride;\
 817         src+=srcStride;\
 818     }\
 819 }\
 820 \
 821 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 822     const int w=8;\
 823     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 824     int i;\
 825     for(i=0; i<w; i++)\
 826     {\
 827         const int src0= src[0*srcStride];\
 828         const int src1= src[1*srcStride];\
 829         const int src2= src[2*srcStride];\
 830         const int src3= src[3*srcStride];\
 831         const int src4= src[4*srcStride];\
 832         const int src5= src[5*srcStride];\
 833         const int src6= src[6*srcStride];\
 834         const int src7= src[7*srcStride];\
 835         const int src8= src[8*srcStride];\
 836         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 837         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 838         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 839         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 840         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 841         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 842         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 843         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 844         dst++;\
 845         src++;\
 846     }\
 847 }\
 848 \
 849 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 850     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 851     int i;\
 852     \
 853     for(i=0; i<h; i++)\
 854     {\
 855         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 856         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 857         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 858         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 859         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 860         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 861         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 862         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 863         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 864         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 865         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 866         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 867         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 868         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 869         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 870         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 871         dst+=dstStride;\
 872         src+=srcStride;\
 873     }\
 874 }\
 875 \
 876 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 877     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 878     int i;\
 879     const int w=16;\
 880     for(i=0; i<w; i++)\
 881     {\
 882         const int src0= src[0*srcStride];\
 883         const int src1= src[1*srcStride];\
 884         const int src2= src[2*srcStride];\
 885         const int src3= src[3*srcStride];\
 886         const int src4= src[4*srcStride];\
 887         const int src5= src[5*srcStride];\
 888         const int src6= src[6*srcStride];\
 889         const int src7= src[7*srcStride];\
 890         const int src8= src[8*srcStride];\
 891         const int src9= src[9*srcStride];\
 892         const int src10= src[10*srcStride];\
 893         const int src11= src[11*srcStride];\
 894         const int src12= src[12*srcStride];\
 895         const int src13= src[13*srcStride];\
 896         const int src14= src[14*srcStride];\
 897         const int src15= src[15*srcStride];\
 898         const int src16= src[16*srcStride];\
 899         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 900         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 901         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 902         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 903         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 904         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 905         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 906         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 907         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 908         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 909         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 910         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 911         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 912         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 913         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 914         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 915         dst++;\
 916         src++;\
 917     }\
 918 }\
 919 \
 920 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 921     uint8_t half[64];\
 922     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 923     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 924 }\
 925 \
 926 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 927     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 928 }\
 929 \
 930 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 931     uint8_t half[64];\
 932     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 933     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 934 }\
 935 \
 936 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 937     uint8_t full[16*9];\
 938     uint8_t half[64];\
 939     copy_block9(full, src, 16, stride, 9);\
 940     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 941     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 942 }\
 943 \
 944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 945     uint8_t full[16*9];\
 946     copy_block9(full, src, 16, stride, 9);\
 947     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 948 }\
 949 \
 950 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 951     uint8_t full[16*9];\
 952     uint8_t half[64];\
 953     copy_block9(full, src, 16, stride, 9);\
 954     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 955     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 956 }\
 957 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 958     uint8_t full[16*9];\
 959     uint8_t halfH[72];\
 960     uint8_t halfV[64];\
 961     uint8_t halfHV[64];\
 962     copy_block9(full, src, 16, stride, 9);\
 963     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 964     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 965     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 966     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 967 }\
 968 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 969     uint8_t full[16*9];\
 970     uint8_t halfH[72];\
 971     uint8_t halfHV[64];\
 972     copy_block9(full, src, 16, stride, 9);\
 973     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 974     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 975     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 976     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 977 }\
 978 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 979     uint8_t full[16*9];\
 980     uint8_t halfH[72];\
 981     uint8_t halfV[64];\
 982     uint8_t halfHV[64];\
 983     copy_block9(full, src, 16, stride, 9);\
 984     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 985     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 986     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 987     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 988 }\
 989 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 990     uint8_t full[16*9];\
 991     uint8_t halfH[72];\
 992     uint8_t halfHV[64];\
 993     copy_block9(full, src, 16, stride, 9);\
 994     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 995     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 996     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 997     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 998 }\
 999 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000     uint8_t full[16*9];\
1001     uint8_t halfH[72];\
1002     uint8_t halfV[64];\
1003     uint8_t halfHV[64];\
1004     copy_block9(full, src, 16, stride, 9);\
1005     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 }\
1010 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1011     uint8_t full[16*9];\
1012     uint8_t halfH[72];\
1013     uint8_t halfHV[64];\
1014     copy_block9(full, src, 16, stride, 9);\
1015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1019 }\
1020 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021     uint8_t full[16*9];\
1022     uint8_t halfH[72];\
1023     uint8_t halfV[64];\
1024     uint8_t halfHV[64];\
1025     copy_block9(full, src, 16, stride, 9);\
1026     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 }\
1031 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1032     uint8_t full[16*9];\
1033     uint8_t halfH[72];\
1034     uint8_t halfHV[64];\
1035     copy_block9(full, src, 16, stride, 9);\
1036     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1040 }\
1041 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1042     uint8_t halfH[72];\
1043     uint8_t halfHV[64];\
1044     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1045     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1046     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1047 }\
1048 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1049     uint8_t halfH[72];\
1050     uint8_t halfHV[64];\
1051     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1052     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1053     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1054 }\
1055 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1056     uint8_t full[16*9];\
1057     uint8_t halfH[72];\
1058     uint8_t halfV[64];\
1059     uint8_t halfHV[64];\
1060     copy_block9(full, src, 16, stride, 9);\
1061     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1063     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1064     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1065 }\
1066 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1067     uint8_t full[16*9];\
1068     uint8_t halfH[72];\
1069     copy_block9(full, src, 16, stride, 9);\
1070     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1071     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1072     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1073 }\
1074 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1075     uint8_t full[16*9];\
1076     uint8_t halfH[72];\
1077     uint8_t halfV[64];\
1078     uint8_t halfHV[64];\
1079     copy_block9(full, src, 16, stride, 9);\
1080     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1081     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1082     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1083     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1084 }\
1085 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1086     uint8_t full[16*9];\
1087     uint8_t halfH[72];\
1088     copy_block9(full, src, 16, stride, 9);\
1089     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1090     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1091     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1092 }\
1093 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1094     uint8_t halfH[72];\
1095     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1096     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1097 }\
1098 \
1099 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1100     uint8_t half[256];\
1101     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1102     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1103 }\
1104 \
1105 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1106     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1107 }\
1108 \
1109 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1110     uint8_t half[256];\
1111     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1112     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1113 }\
1114 \
1115 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1116     uint8_t full[24*17];\
1117     uint8_t half[256];\
1118     copy_block17(full, src, 24, stride, 17);\
1119     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1120     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1121 }\
1122 \
1123 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1124     uint8_t full[24*17];\
1125     copy_block17(full, src, 24, stride, 17);\
1126     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1127 }\
1128 \
1129 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1130     uint8_t full[24*17];\
1131     uint8_t half[256];\
1132     copy_block17(full, src, 24, stride, 17);\
1133     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1134     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1135 }\
1136 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1137     uint8_t full[24*17];\
1138     uint8_t halfH[272];\
1139     uint8_t halfV[256];\
1140     uint8_t halfHV[256];\
1141     copy_block17(full, src, 24, stride, 17);\
1142     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1143     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1144     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1145     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1146 }\
1147 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1148     uint8_t full[24*17];\
1149     uint8_t halfH[272];\
1150     uint8_t halfHV[256];\
1151     copy_block17(full, src, 24, stride, 17);\
1152     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1153     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1154     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1155     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1156 }\
1157 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158     uint8_t full[24*17];\
1159     uint8_t halfH[272];\
1160     uint8_t halfV[256];\
1161     uint8_t halfHV[256];\
1162     copy_block17(full, src, 24, stride, 17);\
1163     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1165     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1167 }\
1168 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1169     uint8_t full[24*17];\
1170     uint8_t halfH[272];\
1171     uint8_t halfHV[256];\
1172     copy_block17(full, src, 24, stride, 17);\
1173     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1175     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1177 }\
1178 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179     uint8_t full[24*17];\
1180     uint8_t halfH[272];\
1181     uint8_t halfV[256];\
1182     uint8_t halfHV[256];\
1183     copy_block17(full, src, 24, stride, 17);\
1184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 }\
1189 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1190     uint8_t full[24*17];\
1191     uint8_t halfH[272];\
1192     uint8_t halfHV[256];\
1193     copy_block17(full, src, 24, stride, 17);\
1194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1198 }\
1199 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200     uint8_t full[24*17];\
1201     uint8_t halfH[272];\
1202     uint8_t halfV[256];\
1203     uint8_t halfHV[256];\
1204     copy_block17(full, src, 24, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 }\
1210 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1211     uint8_t full[24*17];\
1212     uint8_t halfH[272];\
1213     uint8_t halfHV[256];\
1214     copy_block17(full, src, 24, stride, 17);\
1215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1219 }\
1220 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1221     uint8_t halfH[272];\
1222     uint8_t halfHV[256];\
1223     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1224     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1225     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1226 }\
1227 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1228     uint8_t halfH[272];\
1229     uint8_t halfHV[256];\
1230     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1231     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1232     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1233 }\
1234 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1235     uint8_t full[24*17];\
1236     uint8_t halfH[272];\
1237     uint8_t halfV[256];\
1238     uint8_t halfHV[256];\
1239     copy_block17(full, src, 24, stride, 17);\
1240     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1242     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1243     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1244 }\
1245 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1246     uint8_t full[24*17];\
1247     uint8_t halfH[272];\
1248     copy_block17(full, src, 24, stride, 17);\
1249     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1251     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1252 }\
1253 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1254     uint8_t full[24*17];\
1255     uint8_t halfH[272];\
1256     uint8_t halfV[256];\
1257     uint8_t halfHV[256];\
1258     copy_block17(full, src, 24, stride, 17);\
1259     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1261     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1262     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1263 }\
1264 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1265     uint8_t full[24*17];\
1266     uint8_t halfH[272];\
1267     copy_block17(full, src, 24, stride, 17);\
1268     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1269     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1270     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1271 }\
1272 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1273     uint8_t halfH[272];\
1274     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1275     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1276 }
1277
/*
 * Store operations handed to QPEL_MC as its last argument.
 * "a" is the destination pixel (an lvalue), "b" the raw filter sum.
 * The sum is biased by 16 (or by 15 for the no_rnd variants), shifted right
 * by 5 and then looked up through the local table pointer "cm", which the
 * functions generated by QPEL_MC set to ff_cropTbl + MAX_NEG_CROP
 * (presumably a clamp to the 8-bit pixel range -- confirm against the
 * table initialisation).
 *  - op_put / op_put_no_rnd overwrite the destination.
 *  - op_avg / op_avg_no_rnd average the new value with the value already in
 *    the destination; op_avg adds 1 before halving, op_avg_no_rnd does not.
 * All four macros expect "cm" to be in scope at the point of use and are
 * undefined again once the instantiations below are done.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Generate the put_, put_no_rnd_ and avg_ families of qpel functions. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1291
/*
 * The mc00 entries are not generated by QPEL_MC; they are aliased to
 * existing whole-block put/avg pixel functions instead. The no_rnd put
 * variants reuse the ordinary put functions.
 * NOTE(review): the 16x16 no_rnd alias targets the "_8_c"-suffixed symbol
 * while the other aliases use the plain "_c" names -- confirm both symbols
 * exist and that the difference is intentional.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1298
1299 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1300     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1301     int i;
1302
1303     for(i=0; i<h; i++){
1304         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1305         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1306         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1307         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1308         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1309         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1310         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1311         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1312         dst+=dstStride;
1313         src+=srcStride;
1314     }
1315 }
1316
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points: each one simply forwards to the matching 8-bit
 * xy2 put/avg pixel routine, passing the block size (16 or 8) as the height.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1331
#if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME) generates the ff_<OPNAME>_dirac_pixels{8,16,32}[_l2|_l4]_c
 * wrappers around the 8-bit pixel helpers of the same OPNAME.
 *  - The plain variants use only src[0].
 *  - The _l2 variants pass src[0] and src[1] to the *_l2_8 helper.
 *  - The _l4 variants pass src[0] .. src[3] to the *_l4_8 helper.
 *    (src[4] is not touched by any of these wrappers.)
 *  - The 32-pixel-wide variants are built from two 16-pixel-wide calls, the
 *    second one offset by 16 bytes in dst and in every source pointer.
 * The single "stride" argument is used for the destination and for every
 * source plane alike.
 */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
     OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
/* Instantiate the put and avg flavours. */
DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1376
1377 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1378     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1379     int i;
1380
1381     for(i=0; i<w; i++){
1382         const int src_1= src[ -srcStride];
1383         const int src0 = src[0          ];
1384         const int src1 = src[  srcStride];
1385         const int src2 = src[2*srcStride];
1386         const int src3 = src[3*srcStride];
1387         const int src4 = src[4*srcStride];
1388         const int src5 = src[5*srcStride];
1389         const int src6 = src[6*srcStride];
1390         const int src7 = src[7*srcStride];
1391         const int src8 = src[8*srcStride];
1392         const int src9 = src[9*srcStride];
1393         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1394         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1395         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1396         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1397         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1398         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1399         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1400         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1401         src++;
1402         dst++;
1403     }
1404 }
1405
/**
 * mspel mc10: run the horizontal lowpass over the 8x8 block into a temporary
 * and hand the unfiltered source together with that temporary to
 * put_pixels8_l2_8 (presumably a rounded average of its two inputs).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[64];   /* 8 rows of 8 filtered pixels, stride 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, hfilt, stride, stride, 8, 8);
}
1411
/**
 * mspel mc20: the horizontally lowpass-filtered 8x8 block is written
 * straight into dst (source and destination share the same stride).
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1415
/**
 * mspel mc30: same as mc10, except that the unfiltered input passed to
 * put_pixels8_l2_8 is the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[64];   /* 8 rows of 8 filtered pixels, stride 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, hfilt, stride, stride, 8, 8);
}
1421
/**
 * mspel mc02: the vertically lowpass-filtered 8-column block is written
 * straight into dst (source and destination share the same stride).
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1425
/**
 * mspel mc12: combine the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2_8.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * following vertical pass reads rows -1 .. 9 relative to its input pointer;
 * passing hpass+8 makes that row -1 land on the first filtered row.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[88];    /* 11 rows x 8, horizontal lowpass             */
    uint8_t vpass[64];    /*  8 rows x 8, vertical lowpass of the source */
    uint8_t hvpass[64];   /*  8 rows x 8, vertical lowpass of hpass      */

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * mspel mc32: like mc12, but the purely vertical pass is taken from the
 * source shifted one pixel to the right (src+1).
 *
 * The horizontal pass covers 11 rows starting one row above src so that the
 * vertical pass over hpass+8 has its rows -1 .. 9 available.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[88];    /* 11 rows x 8, horizontal lowpass           */
    uint8_t vpass[64];    /*  8 rows x 8, vertical lowpass of src+1    */
    uint8_t hvpass[64];   /*  8 rows x 8, vertical lowpass of hpass    */

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * mspel mc22: horizontal lowpass into an 11-row temporary (starting one row
 * above src), then vertical lowpass of that temporary directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hpass[88];   /* 11 rows x 8, horizontal lowpass */

    wmv2_mspel8_h_lowpass(hpass, src - stride, 8, stride, 11);
    /* hpass+8 skips the extra top row, which the vertical filter reads as
     * its row -1. */
    wmv2_mspel8_v_lowpass(dst, hpass + 8, stride, 8, 8);
}
1449
1450 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1451     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1452     int x;
1453     const int strength= ff_h263_loop_filter_strength[qscale];
1454
1455     for(x=0; x<8; x++){
1456         int d1, d2, ad1;
1457         int p0= src[x-2*stride];
1458         int p1= src[x-1*stride];
1459         int p2= src[x+0*stride];
1460         int p3= src[x+1*stride];
1461         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1462
1463         if     (d<-2*strength) d1= 0;
1464         else if(d<-  strength) d1=-2*strength - d;
1465         else if(d<   strength) d1= d;
1466         else if(d< 2*strength) d1= 2*strength - d;
1467         else                   d1= 0;
1468
1469         p1 += d1;
1470         p2 -= d1;
1471         if(p1&256) p1= ~(p1>>31);
1472         if(p2&256) p2= ~(p2>>31);
1473
1474         src[x-1*stride] = p1;
1475         src[x+0*stride] = p2;
1476
1477         ad1= FFABS(d1)>>1;
1478
1479         d2= av_clip((p0-p3)/4, -ad1, ad1);
1480
1481         src[x-2*stride] = p0 - d2;
1482         src[x+  stride] = p3 + d2;
1483     }
1484     }
1485 }
1486
1487 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1488     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1489     int y;
1490     const int strength= ff_h263_loop_filter_strength[qscale];
1491
1492     for(y=0; y<8; y++){
1493         int d1, d2, ad1;
1494         int p0= src[y*stride-2];
1495         int p1= src[y*stride-1];
1496         int p2= src[y*stride+0];
1497         int p3= src[y*stride+1];
1498         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1499
1500         if     (d<-2*strength) d1= 0;
1501         else if(d<-  strength) d1=-2*strength - d;
1502         else if(d<   strength) d1= d;
1503         else if(d< 2*strength) d1= 2*strength - d;
1504         else                   d1= 0;
1505
1506         p1 += d1;
1507         p2 -= d1;
1508         if(p1&256) p1= ~(p1>>31);
1509         if(p2&256) p2= ~(p2>>31);
1510
1511         src[y*stride-1] = p1;
1512         src[y*stride+0] = p2;
1513
1514         ad1= FFABS(d1)>>1;
1515
1516         d2= av_clip((p0-p3)/4, -ad1, ad1);
1517
1518         src[y*stride-2] = p0 - d2;
1519         src[y*stride+1] = p3 + d2;
1520     }
1521     }
1522 }
1523
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a temporary scaled by 4; the first and last
 * rows are not filtered and are simply multiplied by 4.
 * Pass 2 filters horizontally and writes back: the first and last columns
 * only undo the vertical scale ((t + 2) >> 2), interior columns apply the
 * (1, 2, 1) kernel and undo both scales ((sum + 8) >> 4).
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* Pass 1: vertical. Only reads from src. */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row*stride;
        int *out = temp + row*8;
        const int border = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (border)
                out[col] = 4*line[col];
            else
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* Pass 2: horizontal. Only reads from temp, writes to src. */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row*stride;
        const int *in = temp + row*8;

        line[0] = (in[0] + 2) >> 2;
        line[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
    }
}
1550
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated |pix1 - pix2| over 16 x h pixels (0 when h <= 0)
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1578
/**
 * Sum of absolute differences between a 16-sample-wide block and avg2() of
 * each reference sample with its right-hand neighbour.
 *
 * Every row reads pix2[0..16], i.e. one sample more than the block width.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset added to both pointers after every row
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1606
/**
 * Sum of absolute differences between a 16-sample-wide block and avg2() of
 * each reference sample with the sample one row below it.
 *
 * The reference is read for h+1 rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between two consecutive rows
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1636
/**
 * Sum of absolute differences between a 16-sample-wide block and avg4() of
 * each 2x2 neighbourhood (sample, right, below, below-right) of the reference.
 *
 * Every row reads 17 reference samples and the reference is read for h+1 rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between two consecutive rows
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1666
/**
 * Sum of absolute differences between two blocks that are 8 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset added to both pointers after every row
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1686
/**
 * Sum of absolute differences between an 8-sample-wide block and avg2() of
 * each reference sample with its right-hand neighbour.
 *
 * Every row reads pix2[0..8], i.e. one sample more than the block width.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset added to both pointers after every row
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1706
/**
 * Sum of absolute differences between an 8-sample-wide block and avg2() of
 * each reference sample with the sample one row below it.
 *
 * The reference is read for h+1 rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between two consecutive rows
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1728
/**
 * Sum of absolute differences between an 8-sample-wide block and avg4() of
 * each 2x2 neighbourhood (sample, right, below, below-right) of the reference.
 *
 * Every row reads 9 reference samples and the reference is read for h+1 rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size offset between two consecutive rows
 * @param h         number of rows to process
 * @return accumulated sum of absolute differences
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1750
1751 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1752     MpegEncContext *c = v;
1753     int score1=0;
1754     int score2=0;
1755     int x,y;
1756
1757     for(y=0; y<h; y++){
1758         for(x=0; x<16; x++){
1759             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1760         }
1761         if(y+1<h){
1762             for(x=0; x<15; x++){
1763                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1764                              - s1[x+1] + s1[x+1+stride])
1765                         -FFABS(  s2[x  ] - s2[x  +stride]
1766                              - s2[x+1] + s2[x+1+stride]);
1767             }
1768         }
1769         s1+= stride;
1770         s2+= stride;
1771     }
1772
1773     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1774     else  return score1 + FFABS(score2)*8;
1775 }
1776
1777 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1778     MpegEncContext *c = v;
1779     int score1=0;
1780     int score2=0;
1781     int x,y;
1782
1783     for(y=0; y<h; y++){
1784         for(x=0; x<8; x++){
1785             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1786         }
1787         if(y+1<h){
1788             for(x=0; x<7; x++){
1789                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1790                              - s1[x+1] + s1[x+1+stride])
1791                         -FFABS(  s2[x  ] - s2[x  +stride]
1792                              - s2[x+1] + s2[x+1+stride]);
1793             }
1794         }
1795         s1+= stride;
1796         s2+= stride;
1797     }
1798
1799     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1800     else  return score1 + FFABS(score2)*8;
1801 }
1802
1803 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1804     int i;
1805     unsigned int sum=0;
1806
1807     for(i=0; i<8*8; i++){
1808         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1809         int w= weight[i];
1810         b>>= RECON_SHIFT;
1811         assert(-512<b && b<512);
1812
1813         sum += (w*b)*(w*b)>>4;
1814     }
1815     return sum>>2;
1816 }
1817
1818 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1819     int i;
1820
1821     for(i=0; i<8*8; i++){
1822         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1823     }
1824 }
1825
1826 /**
1827  * Permute an 8x8 block.
1828  * @param block the block which will be permuted according to the given permutation vector
1829  * @param permutation the permutation vector
1830  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1831  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1832  *                  (inverse) permutated to scantable order!
1833  */
1834 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1835 {
1836     int i;
1837     DCTELEM temp[64];
1838
1839     if(last<=0) return;
1840     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1841
1842     for(i=0; i<=last; i++){
1843         const int j= scantable[i];
1844         temp[j]= block[j];
1845         block[j]=0;
1846     }
1847
1848     for(i=0; i<=last; i++){
1849         const int j= scantable[i];
1850         const int perm_j= permutation[j];
1851         block[perm_j]= temp[j];
1852     }
1853 }
1854
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
1858
1859 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1860     int i;
1861
1862     memset(cmp, 0, sizeof(void*)*6);
1863
1864     for(i=0; i<6; i++){
1865         switch(type&0xFF){
1866         case FF_CMP_SAD:
1867             cmp[i]= c->sad[i];
1868             break;
1869         case FF_CMP_SATD:
1870             cmp[i]= c->hadamard8_diff[i];
1871             break;
1872         case FF_CMP_SSE:
1873             cmp[i]= c->sse[i];
1874             break;
1875         case FF_CMP_DCT:
1876             cmp[i]= c->dct_sad[i];
1877             break;
1878         case FF_CMP_DCT264:
1879             cmp[i]= c->dct264_sad[i];
1880             break;
1881         case FF_CMP_DCTMAX:
1882             cmp[i]= c->dct_max[i];
1883             break;
1884         case FF_CMP_PSNR:
1885             cmp[i]= c->quant_psnr[i];
1886             break;
1887         case FF_CMP_BIT:
1888             cmp[i]= c->bit[i];
1889             break;
1890         case FF_CMP_RD:
1891             cmp[i]= c->rd[i];
1892             break;
1893         case FF_CMP_VSAD:
1894             cmp[i]= c->vsad[i];
1895             break;
1896         case FF_CMP_VSSE:
1897             cmp[i]= c->vsse[i];
1898             break;
1899         case FF_CMP_ZERO:
1900             cmp[i]= zero_cmp;
1901             break;
1902         case FF_CMP_NSSE:
1903             cmp[i]= c->nsse[i];
1904             break;
1905 #if CONFIG_DWT
1906         case FF_CMP_W53:
1907             cmp[i]= c->w53[i];
1908             break;
1909         case FF_CMP_W97:
1910             cmp[i]= c->w97[i];
1911             break;
1912 #endif
1913         default:
1914             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1915         }
1916     }
1917 }
1918
1919 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1920     long i;
1921     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1922         long a = *(long*)(src+i);
1923         long b = *(long*)(dst+i);
1924         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1925     }
1926     for(; i<w; i++)
1927         dst[i+0] += src[i+0];
1928 }
1929
1930 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1931     long i;
1932 #if !HAVE_FAST_UNALIGNED
1933     if((long)src2 & (sizeof(long)-1)){
1934         for(i=0; i+7<w; i+=8){
1935             dst[i+0] = src1[i+0]-src2[i+0];
1936             dst[i+1] = src1[i+1]-src2[i+1];
1937             dst[i+2] = src1[i+2]-src2[i+2];
1938             dst[i+3] = src1[i+3]-src2[i+3];
1939             dst[i+4] = src1[i+4]-src2[i+4];
1940             dst[i+5] = src1[i+5]-src2[i+5];
1941             dst[i+6] = src1[i+6]-src2[i+6];
1942             dst[i+7] = src1[i+7]-src2[i+7];
1943         }
1944     }else
1945 #endif
1946     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1947         long a = *(long*)(src1+i);
1948         long b = *(long*)(src2+i);
1949         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1950     }
1951     for(; i<w; i++)
1952         dst[i+0] = src1[i+0]-src2[i+0];
1953 }
1954
/**
 * Reconstruct one row from residuals using mid_pred() prediction.
 *
 * For every position the predictor is mid_pred() of the left value, the
 * value from src1 and (left + src1 - left_top) masked to 8 bits; the
 * residual from diff is added and the result stored with 8-bit wraparound.
 *
 * @param dst      reconstructed row
 * @param src1     row supplying the second predictor input and the next left_top
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: initial left value, out: last reconstructed value
 * @param left_top in: initial top-left value, out: last src1 value used
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top      = src1[x];
        const int gradient = (cur_left + top - cur_left_top)&0xFF;

        cur_left     = mid_pred(cur_left, top, gradient) + diff[x];
        cur_left_top = top;
        dst[x]       = cur_left;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
1971
/**
 * Compute the residuals of one row against mid_pred() prediction.
 *
 * For every position the predictor is mid_pred() of the left value, the
 * value from src1 and (left + src1 - left_top) masked to 8 bits; the
 * residual stored in dst is src2 minus that predictor (8-bit wraparound).
 *
 * @param dst      residuals
 * @param src1     row supplying the second predictor input and the next left_top
 * @param src2     row being predicted; each value becomes the next left value
 * @param w        number of samples
 * @param left     in: initial left value, out: last src2 value used
 * @param left_top in: initial top-left value, out: last src1 value used
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top      = src1[x];
        const int gradient = (cur_left + top - cur_left_top)&0xFF;
        const int pred     = mid_pred(cur_left, top, gradient);

        cur_left_top = top;
        cur_left     = src2[x];
        dst[x]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
1989
/**
 * Turn a row of left-prediction residuals into a running sum.
 *
 * The accumulator itself is a plain int and is not wrapped; only the value
 * stored in dst is truncated to 8 bits.
 *
 * @param dst destination row
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
2008
/* byte offsets of the four components inside one 4-byte pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Turn a row of 4-byte pixels holding left-prediction residuals into running
 * sums, one independent accumulator per component.
 *
 * The accumulators are plain ints and are not wrapped; only the values stored
 * in dst are truncated to 8 bits.
 *
 * @param dst   destination row, 4 bytes per pixel
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial accumulator, out: accumulator after the last pixel
 * @param green same for the green component
 * @param blue  same for the blue component
 * @param alpha same for the alpha component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t *in  = src + 4*n;
        uint8_t       *out = dst + 4*n;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2049
/* Two-input butterfly: o1 receives i1+i2 and o2 receives i1-i2.
 * Expands to two separate statements, so it must not be used as the
 * unbraced body of an if/for/while. */
2050 #define BUTTERFLY2(o1,o2,i1,i2) \
2051 o1= (i1)+(i2);\
2052 o2= (i1)-(i2);
2053
/* In-place butterfly: x becomes x+y and y becomes x-y (both computed from
 * the values on entry). Declares locals named a and b, so the arguments
 * must not refer to outer variables with those names. */
2054 #define BUTTERFLY1(x,y) \
2055 {\
2056     int a,b;\
2057     a= x;\
2058     b= y;\
2059     x= a+b;\
2060     y= a-b;\
2061 }
2062
/* FFABS of the sum plus FFABS of the difference of x and y; nothing is
 * written back. Both arguments are evaluated twice. */
2063 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2064
/**
 * Sum of absolute butterfly-transformed differences of an 8x8 block.
 *
 * The difference src - dst is run through three butterfly stages along every
 * row, then two butterfly stages along every column; the third column stage
 * is folded into the final accumulation of absolute values (BUTTERFLYA).
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride offset between two consecutive rows of both blocks
 * @param h      must be 8 (checked with assert)
 * @return accumulated sum of absolute transformed values
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* horizontal butterflies, one row per iteration */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *cur = src + stride*i;
        const uint8_t *ref = dst + stride*i;

        BUTTERFLY2(row[0], row[1], cur[0]-ref[0], cur[1]-ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-ref[2], cur[3]-ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-ref[4], cur[5]-ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-ref[6], cur[7]-ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical butterflies, one column per iteration */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
2109
/**
 * Sum of absolute butterfly-transformed values of an 8x8 block, with the
 * mean term removed.
 *
 * The samples of src are run through three butterfly stages along every row,
 * then two butterfly stages along every column; the third column stage is
 * folded into the final accumulation of absolute values (BUTTERFLYA).
 * Finally |temp[0] + temp[32]| is subtracted from the sum.
 *
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride offset between two consecutive rows
 * @param h      must be 8 (checked with assert)
 * @return accumulated sum of absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int i;

    assert(h==8);

    /* horizontal butterflies, one row per iteration */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *cur = src + stride*i;

        BUTTERFLY2(row[0], row[1], cur[0], cur[1]);
        BUTTERFLY2(row[2], row[3], cur[2], cur[3]);
        BUTTERFLY2(row[4], row[5], cur[4], cur[5]);
        BUTTERFLY2(row[6], row[7], cur[6], cur[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical butterflies, one column per iteration */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return total;
}
2157
2158 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2159     MpegEncContext * const s= (MpegEncContext *)c;
2160     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2161
2162     assert(h==8);
2163
2164     s->dsp.diff_pixels(temp, src1, src2, stride);
2165     s->dsp.fdct(temp);
2166     return s->dsp.sum_abs_dctelem(temp);
2167 }
2168
#if CONFIG_GPL
/* One 8-point integer transform pass. The user supplies SRC(x) to read
 * input x and DST(x,v) to consume output x; all inputs are read before the
 * first DST is executed. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute values of the DCT8_1D-transformed difference of two
 * 8x8 blocks.
 *
 * The difference is produced by the context's diff_pixels callback, every
 * row is transformed in place, and the column pass accumulates the absolute
 * value of each output instead of storing it.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels callback is used
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between two consecutive rows of both blocks
 * @param h      unused
 * @return accumulated sum of absolute transformed values
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s = c;
    DCTELEM coef[8][8];
    int sad = 0;
    int i;

    s->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* row pass, results written back into coef */
#define SRC(x) coef[i][x]
#define DST(x,v) coef[i][x]= v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* column pass, outputs only accumulated */
#define SRC(x) coef[x][i]
#define DST(x,v) sad += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    return sad;
}
#endif
2221
2222 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2223     MpegEncContext * const s= (MpegEncContext *)c;
2224     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2225     int sum=0, i;
2226
2227     assert(h==8);
2228
2229     s->dsp.diff_pixels(temp, src1, src2, stride);
2230     s->dsp.fdct(temp);
2231
2232     for(i=0; i<64; i++)
2233         sum= FFMAX(sum, FFABS(temp[i]));
2234
2235     return sum;
2236 }
2237
2238 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2239     MpegEncContext * const s= (MpegEncContext *)c;
2240     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2241     DCTELEM * const bak = temp+64;
2242     int sum=0, i;
2243
2244     assert(h==8);
2245     s->mb_intra=0;
2246
2247     s->dsp.diff_pixels(temp, src1, src2, stride);
2248
2249     memcpy(bak, temp, 64*sizeof(DCTELEM));
2250
2251     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2252     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2253     ff_simple_idct_8(temp); //FIXME
2254
2255     for(i=0; i<64; i++)
2256         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2257
2258     return sum;
2259 }
2260
2261 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2262     MpegEncContext * const s= (MpegEncContext *)c;
2263     const uint8_t *scantable= s->intra_scantable.permutated;
2264     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2265     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2266     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2267     int i, last, run, bits, level, distortion, start_i;
2268     const int esc_length= s->ac_esc_length;
2269     uint8_t * length;
2270     uint8_t * last_length;
2271
2272     assert(h==8);
2273
2274     copy_block8(lsrc1, src1, 8, stride, 8);
2275     copy_block8(lsrc2, src2, 8, stride, 8);
2276
2277     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2278
2279     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2280
2281     bits=0;
2282
2283     if (s->mb_intra) {
2284         start_i = 1;
2285         length     = s->intra_ac_vlc_length;
2286         last_length= s->intra_ac_vlc_last_length;
2287         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2288     } else {
2289         start_i = 0;
2290         length     = s->inter_ac_vlc_length;
2291         last_length= s->inter_ac_vlc_last_length;
2292     }
2293
2294     if(last>=start_i){
2295         run=0;
2296         for(i=start_i; i<last; i++){
2297             int j= scantable[i];
2298             level= temp[j];
2299
2300             if(level){
2301                 level+=64;
2302                 if((level&(~127)) == 0){
2303                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2304                 }else
2305                     bits+= esc_length;
2306                 run=0;
2307             }else
2308                 run++;
2309         }
2310         i= scantable[last];
2311
2312         level= temp[i] + 64;
2313
2314         assert(level - 64);
2315
2316         if((level&(~127)) == 0){
2317             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2318         }else
2319             bits+= esc_length;
2320
2321     }
2322
2323     if(last>=0){
2324         if(s->mb_intra)
2325             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2326         else
2327             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2328     }
2329
2330     s->dsp.idct_add(lsrc2, 8, temp);
2331
2332     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2333
2334     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2335 }
2336
2337 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2338     MpegEncContext * const s= (MpegEncContext *)c;
2339     const uint8_t *scantable= s->intra_scantable.permutated;
2340     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2341     int i, last, run, bits, level, start_i;
2342     const int esc_length= s->ac_esc_length;
2343     uint8_t * length;
2344     uint8_t * last_length;
2345
2346     assert(h==8);
2347
2348     s->dsp.diff_pixels(temp, src1, src2, stride);
2349
2350     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2351
2352     bits=0;
2353
2354     if (s->mb_intra) {
2355         start_i = 1;
2356         length     = s->intra_ac_vlc_length;
2357         last_length= s->intra_ac_vlc_last_length;
2358         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2359     } else {
2360         start_i = 0;
2361         length     = s->inter_ac_vlc_length;
2362         last_length= s->inter_ac_vlc_last_length;
2363     }
2364
2365     if(last>=start_i){
2366         run=0;
2367         for(i=start_i; i<last; i++){
2368             int j= scantable[i];
2369             level= temp[j];
2370
2371             if(level){
2372                 level+=64;
2373                 if((level&(~127)) == 0){
2374                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2375                 }else
2376                     bits+= esc_length;
2377                 run=0;
2378             }else
2379                 run++;
2380         }
2381         i= scantable[last];
2382
2383         level= temp[i] + 64;
2384
2385         assert(level - 64);
2386
2387         if((level&(~127)) == 0){
2388             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2389         }else
2390             bits+= esc_length;
2391     }
2392
2393     return bits;
2394 }
2395
/**
 * Generate vsad_intra<size>_c(): the sum of absolute differences between
 * every sample of a <size>-wide block and the sample directly below it.
 * h-1 row pairs are examined; c and dummy are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *below = s + stride;                                                                  \
                                                                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - below[col]);                                                            \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2413
/**
 * Sum of absolute vertical differences of the error between two
 * 16-sample-wide blocks.
 *
 * For every sample the error s1 - s2 is compared with the error one row
 * below; h-1 row pairs are examined.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between two consecutive rows of both blocks
 * @param h      number of rows
 * @return accumulated sum of absolute differences
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col+stride] + s2[col+stride];

            total += FFABS(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2428
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))
/**
 * Generate vsse_intra<size>_c(): the sum of squared differences between
 * every sample of a <size>-wide block and the sample directly below it.
 * h-1 row pairs are examined; c and dummy are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        const uint8_t *below = s + stride;                                                                  \
                                                                                                            \
        for (col = 0; col < size; col++) {                                                                  \
            const int delta = s[col] - below[col];                                                          \
                                                                                                            \
            total += delta*delta;                                                                           \
        }                                                                                                   \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2447
/**
 * Vertical SSE of the difference signal (s1 - s2) over a 16-pixel-wide block.
 *
 * Same traversal as the vertical SAD, but each vertical gradient of the
 * difference is squared instead of taken as an absolute value.
 *
 * @param c      unused context pointer
 * @param s1     first picture block
 * @param s2     second picture block
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      number of rows; h - 1 row pairs are evaluated
 * @return accumulated squared vertical gradient of the difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++)
            sum += SQ(s1[col] - s2[col] - below1[col] + below2[col]);

        s1 = (uint8_t *)below1;
        s2 = (uint8_t *)below2;
    }

    return sum;
}
2462
/**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first vector (8-bit signed samples)
 * @param pix2 second vector (16-bit signed samples)
 * @param size number of elements compared
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];
        total += delta * delta;
    }

    return total;
}
2471
/*
 * Instantiate the *16_c comparison functions from their 8x8 counterparts.
 * WRAPPER8_16_SQ is defined earlier in this file (not visible here);
 * presumably it applies the 8x8 function to each 8x8 quadrant of a 16-wide
 * block and sums the results -- confirm against the macro definition.
 * dsputil_init() installs the names generated here through SET_CMP_FUNC().
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* the H.264-style DCT SAD only exists in GPL builds */
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2482
/**
 * Element-wise product of two float vectors.
 *
 * @param dst  output, dst[k] = src0[k] * src1[k]
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
2488
/**
 * Element-wise product of src0 with src1 read back to front.
 *
 * @param dst  output, dst[k] = src0[k] * src1[len - 1 - k]
 * @param src0 first factor, read in ascending order
 * @param src1 second factor, read in descending order
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
2495
/**
 * Element-wise multiply-add of float vectors.
 *
 * @param dst  output, dst[k] = src0[k] * src1[k] + src2[k]
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2501
/**
 * Windowed overlap of two float vectors.
 *
 * Produces 2*len outputs from len samples of src0 (read forwards), len
 * samples of src1 (read backwards) and a window of 2*len coefficients.
 * For k in [0, len) and m = 2*len - 1 - k:
 *   dst[k] = src0[k] * win[m] - src1[len-1-k] * win[k]
 *   dst[m] = src0[k] * win[k] + src1[len-1-k] * win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements
 * @param win  window, 2*len elements
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo;

    for (lo = 0; lo < len; lo++) {
        const int hi    = 2 * len - 1 - lo;
        const float a   = src0[lo];
        const float b   = src1[len - 1 - lo];
        const float wlo = win[lo];
        const float whi = win[hi];

        dst[lo] = a*whi - b*wlo;
        dst[hi] = a*wlo + b*whi;
    }
}
2518
/**
 * Multiply a float vector by a scalar.
 *
 * @param dst output, dst[k] = src[k] * mul
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
2526
/**
 * Multiply a float vector by a scalar and accumulate into dst.
 *
 * @param dst accumulator, dst[k] += src[k] * mul
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] += src[k] * mul;
        k++;
    }
}
2534
/**
 * In-place sum/difference butterfly on two float vectors.
 *
 * After the call v1[k] holds the old v1[k] + v2[k] and v2[k] holds the old
 * v1[k] - v2[k].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2545
/**
 * Sum/difference butterfly with interleaved output.
 *
 * @param dst  output, 2*len elements: dst[2k] = src0[k] + src1[k],
 *             dst[2k+1] = src0[k] - src1[k]
 * @param src0 first input
 * @param src1 second input
 * @param len  number of input elements per vector
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    float *out = dst;
    int k;

    for (k = 0; k < len; k++, out += 2) {
        const float a = src0[k];
        const float b = src1[k];

        out[0] = a + b;
        out[1] = a - b;
    }
}
2557
/**
 * Dot product of two float vectors, accumulated in single precision in
 * ascending index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[k] * v2[k]; 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2568
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] using only
 * integer comparisons. Only valid when min is negative and max is positive.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* mini has its sign bit set, so an unsigned "greater than" means a is
     * negative with a larger magnitude than min */
    if(a > mini) return mini;
    /* flipping the sign bit on both sides turns "a > max" for positive
     * values into an unsigned comparison that negative values never pass */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max.
 *
 * The floats are reinterpreted through a union rather than through
 * uint32_t pointer casts, so no object is accessed through an incompatible
 * lvalue type.
 *
 * @param dst output vector (may be the same buffer as src)
 * @param src input vector
 * @param min pointer to the negative lower bound
 * @param max pointer to the positive upper bound
 * @param len number of elements; elements are handled in groups of 8, so a
 *            len that is not a multiple of 8 is rounded up to the next one
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    union { float f; uint32_t i; } lo, hi, val;
    uint32_t mini, maxi, maxisign;
    int i, k;

    lo.f     = *min;
    hi.f     = *max;
    mini     = lo.i;
    maxi     = hi.i;
    maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        for(k=0; k<8; k++) {
            val.f      = src[i + k];
            val.i      = clipf_c_one(val.i, mini, maxi, maxisign);
            dst[i + k] = val.f;
        }
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds straddle zero the integer-comparison variant is used;
 * otherwise each element goes through av_clipf().
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; processed in groups of 8
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for(base = 0; base < len; base += 8) {
        for(k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2613
/**
 * Dot product of two int16_t vectors with a per-term right shift.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every product before accumulation
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
2623
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For every element the product v1[k] * v2[k] is accumulated using the value
 * of v1[k] before it is updated, then v1[k] += mul * v3[k].
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor applied to v3
 * @return sum of the products of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }

    return acc;
}
2633
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k is
 * applied to both sample k and sample len-1-k. Each product is rounded
 * (add 1 << 14) and shifted right by 15.
 *
 * @param output windowed samples, len elements
 * @param input  input samples, len elements
 * @param window first half of the window, len/2 elements
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int lo;

    for (lo = 0; lo < half; lo++) {
        const unsigned int hi = len - lo - 1;
        const int16_t coef    = window[lo];

        output[lo] = (MUL16(input[lo], coef) + (1 << 14)) >> 15;
        output[hi] = (MUL16(input[hi], coef) + (1 << 14)) >> 15;
    }
}
2646
/**
 * Clip every element of an int32_t vector to [min, max].
 *
 * Exactly len elements are read and written, so len == 0 is a no-op and a
 * len that is not a multiple of 8 neither overruns the buffers nor wraps
 * the unsigned counter.
 *
 * @param dst output vector (may be the same buffer as src)
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
2662
/* Fixed-point cosine table: Wk = 2048*sqrt(2)*cos(k*pi/16), W0 = 2048 */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional WMV2 inverse DCT over 8 consecutive coefficients,
 * performed in place. Results are rounded and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    const int in0 = b[0], in1 = b[1], in2 = b[2], in3 = b[3];
    const int in4 = b[4], in5 = b[5], in6 = b[6], in7 = b[7];
    const int rnd = 1 << 7;
    int e0, e2, e4, e6;         /* even-index rotations */
    int o1, o3, o5, o7;         /* odd-index rotations */
    int r1, r2;                 /* odd terms scaled by 181/256 */

    /* rotations */
    o1 = W1*in1 + W7*in7;
    o7 = W7*in1 - W1*in7;
    o5 = W5*in5 + W3*in3;
    o3 = W3*in5 - W5*in3;
    e2 = W2*in2 + W6*in6;
    e6 = W6*in2 - W2*in6;
    e0 = W0*in0 + W0*in4;
    e4 = W0*in0 - W0*in4;

    /* 181/256 scaling of the combined odd terms */
    r1 = (181*(o1-o5+o7-o3)+128)>>8;
    r2 = (181*(o1-o5-o7+o3)+128)>>8;

    /* final butterflies with rounding */
    b[0] = (e0+e2+o1+o5 + rnd)>>8;
    b[1] = (e4+e6 +r1   + rnd)>>8;
    b[2] = (e4-e6 +r2   + rnd)>>8;
    b[3] = (e0-e2+o7+o3 + rnd)>>8;
    b[4] = (e0-e2-o7-o3 + rnd)>>8;
    b[5] = (e4-e6 -r2   + rnd)>>8;
    b[6] = (e4+e6 -r1   + rnd)>>8;
    b[7] = (e0+e2-o1-o5 + rnd)>>8;
}

/**
 * One-dimensional WMV2 inverse DCT over 8 coefficients spaced 8 elements
 * apart (one column of an 8x8 block), performed in place. The rotations are
 * pre-scaled by 3 bits and the results rounded and scaled down by 14 bits.
 */
static void wmv2_idct_col(short * b)
{
    const int in0 = b[8*0], in1 = b[8*1], in2 = b[8*2], in3 = b[8*3];
    const int in4 = b[8*4], in5 = b[8*5], in6 = b[8*6], in7 = b[8*7];
    const int rnd = 1 << 13;
    int e0, e2, e4, e6;
    int o1, o3, o5, o7;
    int r1, r2;

    /* rotations, with extended precision */
    o1 = (W1*in1 + W7*in7 + 4)>>3;
    o7 = (W7*in1 - W1*in7 + 4)>>3;
    o5 = (W5*in5 + W3*in3 + 4)>>3;
    o3 = (W3*in5 - W5*in3 + 4)>>3;
    e2 = (W2*in2 + W6*in6 + 4)>>3;
    e6 = (W6*in2 - W2*in6 + 4)>>3;
    e0 = (W0*in0 + W0*in4    )>>3;
    e4 = (W0*in0 - W0*in4    )>>3;

    /* 181/256 scaling of the combined odd terms */
    r1 = (181*(o1-o5+o7-o3)+128)>>8;
    r2 = (181*(o1-o5-o7+o3)+128)>>8;

    /* final butterflies with rounding */
    b[8*0] = (e0+e2+o1+o5 + rnd)>>14;
    b[8*1] = (e4+e6 +r1   + rnd)>>14;
    b[8*2] = (e4-e6 +r2   + rnd)>>14;
    b[8*3] = (e0-e2+o7+o3 + rnd)>>14;

    b[8*4] = (e0-e2-o7-o3 + rnd)>>14;
    b[8*5] = (e4-e6 -r2   + rnd)>>14;
    b[8*6] = (e4+e6 -r1   + rnd)>>14;
    b[8*7] = (e0+e2-o1-o5 + rnd)>>14;
}

/**
 * In-place 8x8 WMV2 inverse DCT: all eight rows first, then all eight
 * columns.
 *
 * @param block 64 coefficients in row-major order, overwritten by the result
 */
void ff_wmv2_idct_c(short * block){
    int k;

    for (k = 0; k < 8; k++)
        wmv2_idct_row(block + 8*k);

    for (k = 0; k < 8; k++)
        wmv2_idct_col(block + k);
}
2735 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2736  converted */
/* Run the WMV2 inverse DCT in place on block, then pass the result together
 * with dest and line_size to ff_put_pixels_clamped_c(). */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* Run the WMV2 inverse DCT in place on block, then pass the result together
 * with dest and line_size to ff_add_pixels_clamped_c(). */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* Transform block with j_rev_dct(), then pass the result together with dest
 * and line_size to ff_put_pixels_clamped_c(). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* Transform block with j_rev_dct(), then pass the result together with dest
 * and line_size to ff_add_pixels_clamped_c(). */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
2757
/* Transform block with j_rev_dct4(), then pass the result together with dest
 * and line_size to put_pixels_clamped4_c(). dsputil_init() selects this for
 * avctx->lowres == 1. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* Transform block with j_rev_dct4(), then pass the result together with dest
 * and line_size to add_pixels_clamped4_c(). dsputil_init() selects this for
 * avctx->lowres == 1. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2768
/* Transform block with j_rev_dct2(), then pass the result together with dest
 * and line_size to put_pixels_clamped2_c(). dsputil_init() selects this for
 * avctx->lowres == 2. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* Transform block with j_rev_dct2(), then pass the result together with dest
 * and line_size to add_pixels_clamped2_c(). dsputil_init() selects this for
 * avctx->lowres == 2. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2779
2780 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2781 {
2782     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2783
2784     dest[0] = cm[(block[0] + 4)>>3];
2785 }
2786 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2787 {
2788     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2789
2790     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2791 }
2792
/* No-op that ignores all of its arguments; dsputil_init() installs it as the
 * default c->prefetch. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2794
2795 /* init static data */
2796 av_cold void dsputil_static_init(void)
2797 {
2798     int i;
2799
2800     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2801     for(i=0;i<MAX_NEG_CROP;i++) {
2802         ff_cropTbl[i] = 0;
2803         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2804     }
2805
2806     for(i=0;i<512;i++) {
2807         ff_squareTbl[i] = (i - 256) * (i - 256);
2808     }
2809
2810     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2811 }
2812
2813 int ff_check_alignment(void){
2814     static int did_fail=0;
2815     LOCAL_ALIGNED_16(int, aligned, [4]);
2816
2817     if((intptr_t)aligned & 15){
2818         if(!did_fail){
2819 #if HAVE_MMX || HAVE_ALTIVEC
2820             av_log(NULL, AV_LOG_ERROR,
2821                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2822                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2823                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2824                 "Do not report crashes to FFmpeg developers.\n");
2825 #endif
2826             did_fail=1;
2827         }
2828         return -1;
2829     }
2830     return 0;
2831 }
2832
/**
 * Fill a DSPContext with the C implementations and then let the enabled
 * architecture-specific initialisers override individual entries.
 *
 * The choice of (I)DCT and of the bit-depth dependent functions is driven by
 * avctx->bits_per_raw_sample, avctx->dct_algo, avctx->idct_algo and
 * avctx->lowres. c->dct_bits is read but never assigned in this function.
 *
 * @param c     context to fill
 * @param avctx codec context supplying the selection parameters
 */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* only reports a misaligned stack; the return value is ignored */
    ff_check_alignment();

    /* forward DCT selection (encoders only) */
#if CONFIG_ENCODERS
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = fdct_ifast;
            c->fdct248 = fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres takes precedence over bit depth and
     * over the requested idct_algo */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if (avctx->bits_per_raw_sample == 10) {
            c->idct_put              = ff_simple_idct_put_10;
            c->idct_add              = ff_simple_idct_add_10;
            c->idct                  = ff_simple_idct_10;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        } else {
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE(review): idct_add and idct are not assigned in this branch */
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put = ff_simple_idct_put_8;
            c->idct_add = ff_simple_idct_add_8;
            c->idct     = ff_simple_idct_8;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
        }
    }

    /* bit-depth independent C defaults */
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* third-pel tables: indices 3, 7 and 11 are left unassigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc(PFX, IDX, NUM) fills the 16 entries of c->PFX_pixels_tab[IDX]
     * with PFX<NUM>_mcXY_c, where entry index = X + 4*Y */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc

    /* codec-specific sub-initialisers, compiled in only when needed */
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) installs name16_c as entry 0 and name8x8_c as
     * entry 1 of the comparison table c->name */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    /* entries 4 and 5 of vsad/vsse hold the intra (single-picture) variants */
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* audio and generic vector helpers */
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->butterflies_float_interleave = butterflies_float_interleave_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
    c->vector_fmac_scalar = vector_fmac_scalar_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* cleared so that the loop at the end of this function can detect which
     * 2-tap entries are still unset after the arch-specific initialisers */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* FUNC/FUNCC build the bit-depth suffixed names f_<depth> and
     * f_<depth>_c */
#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


    /* BIT_DEPTH_FUNCS(depth, dct) installs every function that exists once
     * per pixel bit depth; the dct suffix additionally selects the variant
     * of the functions that handle coefficient blocks */
#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

    /* 9- and 10-bit content additionally picks the _32 or _16 coefficient
     * variant from c->dct_bits; 8-bit always uses _16 */
    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    default:
        av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
        /* fall through: every other depth uses the 8-bit functions */
    case 8:
        BIT_DEPTH_FUNCS(8, _16);
        break;
    }


    /* architecture-specific overrides; they run after the C defaults above
     * so they can replace individual entries */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* any 2-tap qpel entry still NULL falls back to the H.264 qpel function.
     * NOTE(review): row 0 is indexed with i up to 63, i.e. the tables are
     * walked as flat 64-entry arrays -- this presumably relies on all four
     * tables being declared [4][16]; confirm in dsputil.h */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the selected IDCT */
    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}