3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Saturation LUT: indexed as (ff_cropTbl + MAX_NEG_CROP)[v] to clamp v to
 * 0..255.  NOTE(review): only zero-initialized here; presumably filled at
 * init time elsewhere — confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares LUT, used as sq = ff_squareTbl + 256 so sq[d] is valid for signed
 * differences d in [-255, 255].  NOTE(review): also filled at init — confirm. */
uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f) /* 0x7f replicated into every byte of an unsigned long */
#define pb_80 (~0UL/255 * 0x80) /* 0x80 replicated into every byte of an unsigned long */
/* Classic 8x8 zigzag scan: entry i is the raster-order index of the i-th
 * coefficient in scan order (low frequencies first). */
const uint8_t ff_zigzag_direct[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields (rows of the two fields
   alternate in the scan order). */
const uint8_t ff_zigzag248_direct[64] = {
0, 8, 1, 9, 16, 24, 2, 10,
17, 25, 32, 40, 48, 56, 33, 41,
18, 26, 3, 11, 4, 12, 19, 27,
34, 42, 49, 57, 50, 58, 35, 43,
20, 28, 5, 13, 6, 14, 21, 29,
36, 44, 51, 59, 52, 60, 37, 45,
22, 30, 7, 15, 23, 31, 38, 46,
53, 61, 54, 62, 39, 47, 55, 63,
/* Non-permuted inverse of ff_zigzag_direct with every value offset by +1,
 * for the MMX quantizer.  NOTE(review): declared here, filled at init
 * elsewhere — confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate (horizontally biased) scan order: maps scan position to raster
 * index within an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
0, 1, 2, 3, 8, 9, 16, 17,
10, 11, 4, 5, 6, 7, 15, 14,
13, 12, 19, 18, 24, 25, 32, 33,
26, 27, 20, 21, 22, 23, 28, 29,
30, 31, 34, 35, 40, 41, 48, 49,
42, 43, 36, 37, 38, 39, 44, 45,
46, 47, 50, 51, 56, 57, 58, 59,
52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertically biased) scan order: maps scan position to raster
 * index within an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
0, 8, 16, 24, 1, 9, 2, 10,
17, 25, 32, 40, 48, 56, 57, 49,
41, 33, 26, 18, 3, 11, 4, 12,
19, 27, 34, 42, 50, 58, 35, 43,
51, 59, 20, 28, 5, 13, 6, 14,
21, 29, 36, 44, 52, 60, 37, 45,
53, 61, 22, 30, 7, 15, 23, 31,
38, 46, 54, 62, 39, 47, 55, 63,
/* Input permutation for the simple_idct_mmx: entry i gives the coefficient
 * index (written in hex as row<<3 | column) that scan position i maps to. */
static const uint8_t simple_mmx_permutation[64]={
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Build a ScanTable from a raster-order scan table and an IDCT coefficient
 * permutation: st->permutated[] is src_scantable[] remapped through
 * permutation[], and st->raster_end[] records per-position end markers.
 * NOTE(review): the loop headers are not visible in this excerpt —
 * presumably both loops iterate i over 0..63; confirm against full source. */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
st->scantable= src_scantable;
/* remap each scan position through the IDCT permutation */
j = src_scantable[i];
st->permutated[i] = permutation[j];
/* second pass: compute raster_end from the permuted scan */
j = st->permutated[i];
st->raster_end[i]= end;
/* Fill idct_permutation[0..63] for the requested permutation type, mapping
 * a coefficient index to the index the IDCT implementation expects.
 * NOTE(review): per-case loop headers and break statements are not visible
 * in this excerpt; each case presumably iterates i over 0..63. */
void ff_init_scantable_permutation(uint8_t *idct_permutation,
int idct_permutation_type)
switch(idct_permutation_type){
case FF_NO_IDCT_PERM:
/* identity mapping */
idct_permutation[i]= i;
case FF_LIBMPEG2_IDCT_PERM:
/* keep the row bits, rotate the three column bits */
idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
case FF_SIMPLE_IDCT_PERM:
idct_permutation[i]= simple_mmx_permutation[i];
case FF_TRANSPOSE_IDCT_PERM:
/* swap row and column (transpose the 8x8 index) */
idct_permutation[i]= ((i&7)<<3) | (i>>3);
case FF_PARTTRANS_IDCT_PERM:
/* partial transpose: swap only the low two row/column bits */
idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
case FF_SSE2_IDCT_PERM:
/* keep the row, permute elements within the row for SSE2 */
idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of the 256 pixel values of a 16x16 block, processed 8 pixels per
 * inner iteration; line_size is the row stride in bytes.
 * NOTE(review): the accumulation statements are not visible in this
 * excerpt — only the loop structure and the row advance. */
static int pix_sum_c(uint8_t * pix, int line_size)
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
/* step back over the 16 pixels consumed, then down one row */
pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, using the squares LUT
 * (ff_squareTbl + 256).  Reads 8 pixels per iteration: one 64-bit load on
 * 64-bit builds, two 32-bit loads otherwise.
 * NOTE(review): the casted loads type-pun uint8_t* as uint32_t*/uint64_t* —
 * a strict-aliasing/alignment liberty kept for speed; preserved as-is. */
static int pix_norm1_c(uint8_t * pix, int line_size)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
/* 64-bit path: unpack 8 bytes from one load */
register uint64_t x=*(uint64_t*)pix;
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
s += sq[(x>>32)&0xff];
s += sq[(x>>40)&0xff];
s += sq[(x>>48)&0xff];
s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte loads */
register uint32_t x=*(uint32_t*)pix;
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
x=*(uint32_t*)(pix+4);
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
/* step back over the 16 pixels consumed, then down one row */
pix += line_size - 16;
/* Byteswap w 32-bit words from src into dst, unrolled 8 at a time;
 * the final statement handles the remaining tail words.
 * NOTE(review): the tail-loop header is not visible in this excerpt. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
for(i=0; i+8<=w; i+=8){
dst[i+0]= av_bswap32(src[i+0]);
dst[i+1]= av_bswap32(src[i+1]);
dst[i+2]= av_bswap32(src[i+2]);
dst[i+3]= av_bswap32(src[i+3]);
dst[i+4]= av_bswap32(src[i+4]);
dst[i+5]= av_bswap32(src[i+5]);
dst[i+6]= av_bswap32(src[i+6]);
dst[i+7]= av_bswap32(src[i+7]);
/* tail: one word per iteration */
dst[i+0]= av_bswap32(src[i+0]);
/* Byteswap len 16-bit values from src into dst.
 * NOTE(review): the loop header is not visible in this excerpt. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
*dst++ = av_bswap16(*src++);
277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
280 uint32_t *sq = ff_squareTbl + 256;
283 for (i = 0; i < h; i++) {
284 s += sq[pix1[0] - pix2[0]];
285 s += sq[pix1[1] - pix2[1]];
286 s += sq[pix1[2] - pix2[2]];
287 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors between two 8-wide pixel blocks over h rows;
 * same scheme as sse4_c. */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
s += sq[pix1[4] - pix2[4]];
s += sq[pix1[5] - pix2[5]];
s += sq[pix1[6] - pix2[6]];
s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors between two 16-wide pixel blocks over h rows;
 * same scheme as sse4_c/sse8_c, fully unrolled per row. */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[ 0] - pix2[ 0]];
s += sq[pix1[ 1] - pix2[ 1]];
s += sq[pix1[ 2] - pix2[ 2]];
s += sq[pix1[ 3] - pix2[ 3]];
s += sq[pix1[ 4] - pix2[ 4]];
s += sq[pix1[ 5] - pix2[ 5]];
s += sq[pix1[ 6] - pix2[ 6]];
s += sq[pix1[ 7] - pix2[ 7]];
s += sq[pix1[ 8] - pix2[ 8]];
s += sq[pix1[ 9] - pix2[ 9]];
s += sq[pix1[10] - pix2[10]];
s += sq[pix1[11] - pix2[11]];
s += sq[pix1[12] - pix2[12]];
s += sq[pix1[13] - pix2[13]];
s += sq[pix1[14] - pix2[14]];
s += sq[pix1[15] - pix2[15]];
/* Compute block[] = s1[] - s2[] for an 8x8 block, one row of 8 per
 * iteration.  NOTE(review): the loop header and the block/s1/s2 stride
 * advances are not visible in this excerpt. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride){
/* read the pixels */
block[0] = s1[0] - s2[0];
block[1] = s1[1] - s2[1];
block[2] = s1[2] - s2[2];
block[3] = s1[3] - s2[3];
block[4] = s1[4] - s2[4];
block[5] = s1[5] - s2[5];
block[6] = s1[6] - s2[6];
block[7] = s1[7] - s2[7];
/* Store an 8x8 block of DCT coefficients as pixels, clamped to 0..255 via
 * the crop LUT (cm absorbs negative indices thanks to the MAX_NEG_CROP
 * offset).  One row of 8 per iteration; loop header not visible here. */
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
pixels[4] = cm[block[4]];
pixels[5] = cm[block[5]];
pixels[6] = cm[block[6]];
pixels[7] = cm[block[7]];
/* 4-wide variant of ff_put_pixels_clamped_c: clamp and store 4 coefficients
 * per row via the crop LUT. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
/* 2-wide variant of ff_put_pixels_clamped_c: clamp and store 2 coefficients
 * per row via the crop LUT. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
/* Store an 8x8 block of signed DCT coefficients biased by +128 and clamped
 * to 0..255 (values < -128 and > 127 are saturated in branches whose bodies
 * are partly elided in this excerpt). */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
uint8_t *restrict pixels,
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
else if (*block > 127)
/* in-range value: shift from signed to unsigned representation */
*pixels = (uint8_t)(*block + 128);
/* advance to the start of the next output row */
pixels += (line_size - 8);
/* Add an 8x8 block of DCT coefficients to existing pixels, clamping the
 * result to 0..255 via the crop LUT; one row of 8 per iteration (loop
 * header not visible here). */
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
pixels[4] = cm[pixels[4] + block[4]];
pixels[5] = cm[pixels[5] + block[5]];
pixels[6] = cm[pixels[6] + block[6]];
pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of ff_add_pixels_clamped_c. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of ff_add_pixels_clamped_c. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the DCT coefficients in block.
 * NOTE(review): loop header not visible — presumably iterates 64 elements. */
static int sum_abs_dctelem_c(DCTELEM *block)
sum+= FFABS(block[i]);
/* Fill a 16-wide block of h rows with a constant byte value;
 * line_size is the row stride. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
for (i = 0; i < h; i++) {
memset(block, value, 16);
/* Fill an 8-wide block of h rows with a constant byte value;
 * line_size is the row stride. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
for (i = 0; i < h; i++) {
memset(block, value, 8);
/* Rounding pixel averages used by the motion-compensation code.
 * avg2: average of two values, rounding half away from zero (+1 bias).
 * avg4: average of four values with a +2 rounding bias.
 * Fix: the original macros left the arguments unparenthesized, so an
 * expression argument of lower precedence than '+' (e.g. avg2(x>>1, y))
 * expanded incorrectly.  Fully parenthesizing the arguments preserves the
 * result for all existing simple-variable uses while making the macros
 * safe for arbitrary expressions. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* 1/16-pel bilinear interpolation for GMC: A..D are the bilinear weights
 * (16-x16, x16) x (16-y16, y16), which sum to 256; each output pixel is the
 * weighted average of a 2x2 source neighbourhood, with 'rounder' added
 * before the >>8 normalization.  Row loop and pointer advances are not
 * visible in this excerpt. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
const int A=(16-x16)*(16-y16);
const int B=( x16)*(16-y16);
const int C=(16-x16)*( y16);
const int D=( x16)*( y16);
dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General GMC: for each destination pixel, an affine transform (ox,oy with
 * per-axis deltas dxx/dxy/dyx/dyy, fractional precision s = 1<<shift) gives
 * a source position; the four unsigned-compare branches select full
 * bilinear interpolation inside the picture, 1-D interpolation along one
 * clipped edge, or nearest clipped pixel outside.  The per-pixel src_x/src_y
 * and frac_x/frac_y derivation lines are not visible in this excerpt. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
const int s= 1<<shift;
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
/* (unsigned) compare folds the x<0 and x>=width tests into one */
if((unsigned)src_x < width){
if((unsigned)src_y < height){
/* fully inside: bilinear in both directions */
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clip y, interpolate horizontally only */
index= src_x + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
if((unsigned)src_y < height){
/* horizontally outside: clip x, interpolate vertically only */
index= av_clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
/* outside in both directions: nearest clipped pixel */
index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
/* Third-pel MC, no sub-pel offset: dispatch a plain block copy by width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: put_pixels2_8_c (dst, src, stride, height); break;
case 4: put_pixels4_8_c (dst, src, stride, height); break;
case 8: put_pixels8_8_c (dst, src, stride, height); break;
case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* Third-pel MC (x=1/3, y=0): dst = round((2*a + b)/3); 683 = round(2^11/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC (x=2/3, y=0): dst = round((a + 2*b)/3); 683 = round(2^11/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC (x=0, y=1/3): vertical blend of current and next row. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC (x=1/3, y=1/3): 2x2 blend with weights 4,3,3,2 (sum 12);
 * 2731 = round(2^15/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC (x=1/3, y=2/3): 2x2 blend with weights 3,2,4,3 (sum 12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC (x=0, y=2/3): vertical blend weighted toward the next row. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC (x=2/3, y=1/3): 2x2 blend with weights 3,4,2,3 (sum 12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC (x=2/3, y=2/3): 2x2 blend with weights 2,3,3,4 (sum 12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging third-pel MC, no sub-pel offset: dispatch an averaging copy
 * (dst = avg(dst, src)) by width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: avg_pixels2_8_c (dst, src, stride, height); break;
case 4: avg_pixels4_8_c (dst, src, stride, height); break;
case 8: avg_pixels8_8_c (dst, src, stride, height); break;
case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging third-pel MC (x=1/3, y=0): interpolate as in the put_ variant,
 * then average with the existing dst, rounding up. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging third-pel MC (x=2/3, y=0). */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging third-pel MC (x=0, y=1/3). */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging third-pel MC (x=1/3, y=1/3): 2x2 blend (weights 4,3,3,2),
 * then average with dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging third-pel MC (x=1/3, y=2/3): 2x2 blend (weights 3,2,4,3). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging third-pel MC (x=0, y=2/3). */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging third-pel MC (x=2/3, y=1/3): 2x2 blend (weights 3,4,2,3). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging third-pel MC (x=2/3, y=2/3): 2x2 blend (weights 2,3,3,4). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
802 #define QPEL_MC(r, OPNAME, RND, OP) \
803 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
804 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
808 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
809 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
810 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
811 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
812 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
813 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
814 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
815 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
821 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
823 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
827 const int src0= src[0*srcStride];\
828 const int src1= src[1*srcStride];\
829 const int src2= src[2*srcStride];\
830 const int src3= src[3*srcStride];\
831 const int src4= src[4*srcStride];\
832 const int src5= src[5*srcStride];\
833 const int src6= src[6*srcStride];\
834 const int src7= src[7*srcStride];\
835 const int src8= src[8*srcStride];\
836 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
837 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
838 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
839 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
840 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
841 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
842 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
843 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
849 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
850 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
855 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
856 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
857 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
858 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
859 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
860 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
861 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
862 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
863 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
864 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
865 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
866 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
867 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
868 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
869 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
870 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
876 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
877 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
882 const int src0= src[0*srcStride];\
883 const int src1= src[1*srcStride];\
884 const int src2= src[2*srcStride];\
885 const int src3= src[3*srcStride];\
886 const int src4= src[4*srcStride];\
887 const int src5= src[5*srcStride];\
888 const int src6= src[6*srcStride];\
889 const int src7= src[7*srcStride];\
890 const int src8= src[8*srcStride];\
891 const int src9= src[9*srcStride];\
892 const int src10= src[10*srcStride];\
893 const int src11= src[11*srcStride];\
894 const int src12= src[12*srcStride];\
895 const int src13= src[13*srcStride];\
896 const int src14= src[14*srcStride];\
897 const int src15= src[15*srcStride];\
898 const int src16= src[16*srcStride];\
899 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
900 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
901 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
902 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
903 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
904 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
905 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
906 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
907 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
908 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
909 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
910 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
911 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
912 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
913 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
914 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
920 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
922 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
923 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
926 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
927 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
930 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
932 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
933 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
936 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
939 copy_block9(full, src, 16, stride, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
941 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
946 copy_block9(full, src, 16, stride, 9);\
947 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
950 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
953 copy_block9(full, src, 16, stride, 9);\
954 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
955 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
957 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
962 copy_block9(full, src, 16, stride, 9);\
963 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
964 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
965 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
966 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
968 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
972 copy_block9(full, src, 16, stride, 9);\
973 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
974 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
976 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
978 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
983 copy_block9(full, src, 16, stride, 9);\
984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
987 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
989 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
993 copy_block9(full, src, 16, stride, 9);\
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
995 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
997 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
999 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1010 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1041 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1043 uint8_t halfHV[64];\
1044 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1045 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1046 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1048 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1050 uint8_t halfHV[64];\
1051 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1052 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1053 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1055 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1056 uint8_t full[16*9];\
1059 uint8_t halfHV[64];\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1063 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1064 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1066 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1067 uint8_t full[16*9];\
1069 copy_block9(full, src, 16, stride, 9);\
1070 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1071 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1072 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1074 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1075 uint8_t full[16*9];\
1078 uint8_t halfHV[64];\
1079 copy_block9(full, src, 16, stride, 9);\
1080 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1081 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1082 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1083 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1085 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1086 uint8_t full[16*9];\
1088 copy_block9(full, src, 16, stride, 9);\
1089 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1090 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1091 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1093 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1095 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1096 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1099 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1101 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1102 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1105 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1106 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1109 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1111 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1112 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1115 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1116 uint8_t full[24*17];\
1118 copy_block17(full, src, 24, stride, 17);\
1119 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1120 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1123 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1124 uint8_t full[24*17];\
1125 copy_block17(full, src, 24, stride, 17);\
1126 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1129 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1130 uint8_t full[24*17];\
1132 copy_block17(full, src, 24, stride, 17);\
1133 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1134 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1136 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1138 uint8_t halfH[272];\
1139 uint8_t halfV[256];\
1140 uint8_t halfHV[256];\
1141 copy_block17(full, src, 24, stride, 17);\
1142 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1144 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1145 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1147 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1148 uint8_t full[24*17];\
1149 uint8_t halfH[272];\
1150 uint8_t halfHV[256];\
1151 copy_block17(full, src, 24, stride, 17);\
1152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1153 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1155 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1157 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1159 uint8_t halfH[272];\
1160 uint8_t halfV[256];\
1161 uint8_t halfHV[256];\
1162 copy_block17(full, src, 24, stride, 17);\
1163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1168 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1169 uint8_t full[24*17];\
1170 uint8_t halfH[272];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1178 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1220 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t halfH[272];\
1222 uint8_t halfHV[256];\
1223 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1224 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1225 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1227 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1228 uint8_t halfH[272];\
1229 uint8_t halfHV[256];\
1230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1231 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1232 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1234 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1235 uint8_t full[24*17];\
1236 uint8_t halfH[272];\
1237 uint8_t halfV[256];\
1238 uint8_t halfHV[256];\
1239 copy_block17(full, src, 24, stride, 17);\
1240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1242 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1243 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1245 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1246 uint8_t full[24*17];\
1247 uint8_t halfH[272];\
1248 copy_block17(full, src, 24, stride, 17);\
1249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1253 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1254 uint8_t full[24*17];\
1255 uint8_t halfH[272];\
1256 uint8_t halfV[256];\
1257 uint8_t halfHV[256];\
1258 copy_block17(full, src, 24, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1261 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1262 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1264 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1265 uint8_t full[24*17];\
1266 uint8_t halfH[272];\
1267 copy_block17(full, src, 24, stride, 17);\
1268 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1269 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1270 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1272 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1273 uint8_t halfH[272];\
1274 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1275 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store ops plugged into QPEL_MC above. 'b' is a lowpass accumulator that is
   scaled down by 32 with rounding (+16) or truncating (+15) and clipped via
   the crop table cm[]; the avg variants then average with the old dst. */
1278 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1279 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1280 #define op_put(a, b) a = cm[((b) + 16)>>5]
1281 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel MC function families for put / put_no_rnd / avg. */
1283 QPEL_MC(0, put_ , _ , op_put)
1284 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1285 QPEL_MC(0, avg_ , _ , op_avg)
1286 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The store-op macros are only needed by the instantiations above. */
1288 #undef op_avg_no_rnd
1290 #undef op_put_no_rnd
/* Full-pel (mc00) qpel positions need no filtering, so they alias the plain
   pixel copy/average helpers. The no-rnd variants are the same functions:
   no rounding is involved at full-pel positions.
   Fix: the 16x16 no-rnd alias pointed at ff_put_pixels16x16_8_c, which is
   inconsistent with put_qpel16_mc00_c (ff_put_pixels16x16_c) and with the
   8x8 no-rnd alias; all six aliases now use the same helper family. */
1292 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1293 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1294 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1295 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1296 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1297 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 mspel horizontal half-pel filter for an 8-wide block:
   (-1,9,9,-1)/16 tap filter per output pixel, rounded (+8) and clipped
   through the crop table. Reads src[-1]..src[9] on each row.
   NOTE(review): the per-row loop header/advance and closing braces are
   elided in this listing. */
1299 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1300 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1304 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1305 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1306 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1307 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1308 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1309 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1310 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1311 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1317 #if CONFIG_RV40_DECODER
/* RV40 (3,3) luma qpel positions are mapped to the plain xy2 half-pel
   averaging helpers (put/avg, 16x16 and 8x8 variants). */
1318 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1319 put_pixels16_xy2_8_c(dst, src, stride, 16);
1321 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1322 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1324 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1325 put_pixels8_xy2_8_c(dst, src, stride, 8);
1327 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1328 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1330 #endif /* CONFIG_RV40_DECODER */
/* WMV2 mspel vertical half-pel filter: same (-1,9,9,-1)/16 kernel as the
   horizontal version, applied down a column; reads src[-srcStride] through
   src[9*srcStride]. NOTE(review): per-column loop and closing braces are
   elided in this listing. */
1332 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1333 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1337 const int src_1= src[ -srcStride];
1338 const int src0 = src[0 ];
1339 const int src1 = src[ srcStride];
1340 const int src2 = src[2*srcStride];
1341 const int src3 = src[3*srcStride];
1342 const int src4 = src[4*srcStride];
1343 const int src5 = src[5*srcStride];
1344 const int src6 = src[6*srcStride];
1345 const int src7 = src[7*srcStride];
1346 const int src8 = src[8*srcStride];
1347 const int src9 = src[9*srcStride];
1348 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1349 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1350 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1351 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1352 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1353 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1354 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1355 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation wrappers for the supported sub-pel
   positions (mcXY = X horizontal, Y vertical). They combine the h/v
   lowpass passes with 2-tap averaging (pixels8_l2).
   NOTE(review): local halfpel buffer declarations and closing braces are
   elided in this listing. */
1361 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1363 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1364 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1367 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1368 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1371 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1373 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1374 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1377 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1378 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1381 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* h filter over 11 rows (one above, two below) so the v pass has context */
1385 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1386 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1387 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1388 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1390 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1394 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1395 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1396 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1397 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1399 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1401 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1402 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 in-loop deblocking across a horizontal edge: filters the four
   pixels p0..p3 straddling the edge at each column x, with filter strength
   looked up from the quantizer. */
1405 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1406 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1408 const int strength= ff_h263_loop_filter_strength[qscale];
1412 int p0= src[x-2*stride];
1413 int p1= src[x-1*stride];
1414 int p2= src[x+0*stride];
1415 int p3= src[x+1*stride];
/* edge gradient, dominated by the inner pair (p2-p1) */
1416 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* ramp d1: full correction for small |d|, tapering to 0 beyond 2*strength */
1418 if (d<-2*strength) d1= 0;
1419 else if(d<- strength) d1=-2*strength - d;
1420 else if(d< strength) d1= d;
1421 else if(d< 2*strength) d1= 2*strength - d;
/* clamp to 0..255: if p out of 8-bit range, ~(p>>31) yields 0 (negative)
   or 0xFF.. (overflow), which truncates to 255 on store */
1426 if(p1&256) p1= ~(p1>>31);
1427 if(p2&256) p2= ~(p2>>31);
1429 src[x-1*stride] = p1;
1430 src[x+0*stride] = p2;
/* weaker secondary correction on the outer pixels */
1434 d2= av_clip((p0-p3)/4, -ad1, ad1);
1436 src[x-2*stride] = p0 - d2;
1437 src[x+ stride] = p3 + d2;
/* H.263 in-loop deblocking across a vertical edge; same filter as
   h263_v_loop_filter_c but applied to p0..p3 along each row y. */
1442 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1443 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1445 const int strength= ff_h263_loop_filter_strength[qscale];
1449 int p0= src[y*stride-2];
1450 int p1= src[y*stride-1];
1451 int p2= src[y*stride+0];
1452 int p3= src[y*stride+1];
1453 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* ramp d1 as in the vertical filter */
1455 if (d<-2*strength) d1= 0;
1456 else if(d<- strength) d1=-2*strength - d;
1457 else if(d< strength) d1= d;
1458 else if(d< 2*strength) d1= 2*strength - d;
/* clamp out-of-range values to 0 or 255 (see h263_v_loop_filter_c) */
1463 if(p1&256) p1= ~(p1>>31);
1464 if(p2&256) p2= ~(p2>>31);
1466 src[y*stride-1] = p1;
1467 src[y*stride+0] = p2;
1471 d2= av_clip((p0-p3)/4, -ad1, ad1);
1473 src[y*stride-2] = p0 - d2;
1474 src[y*stride+1] = p3 + d2;
/* H.261 loop filter: separable 1-2-1 smoothing of an 8x8 block done in a
   16-bit temp[] (scaled by 4), with the top/bottom border rows copied
   unfiltered (hence the 4* scale) and a final (…+8)>>4 normalization. */
1479 static void h261_loop_filter_c(uint8_t *src, int stride){
/* border rows: copy with the same x4 scale the filtered rows carry */
1484 temp[x ] = 4*src[x ];
1485 temp[x + 7*8] = 4*src[x + 7*stride];
1489 xy = y * stride + x;
/* vertical 1-2-1 pass into temp */
1491 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* border columns: only the vertical pass applies, normalize by >>2 */
1496 src[ y*stride] = (temp[ y*8] + 2)>>2;
1497 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1499 xy = y * stride + x;
/* horizontal 1-2-1 pass and final rounding */
1501 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* 16-wide SAD (sum of absolute differences) between pix1 and pix2 over h
   rows; used as a motion-estimation comparison function.
   NOTE(review): loop headers, pointer advances and returns are elided in
   this listing. */
1506 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1512 s += abs(pix1[0] - pix2[0]);
1513 s += abs(pix1[1] - pix2[1]);
1514 s += abs(pix1[2] - pix2[2]);
1515 s += abs(pix1[3] - pix2[3]);
1516 s += abs(pix1[4] - pix2[4]);
1517 s += abs(pix1[5] - pix2[5]);
1518 s += abs(pix1[6] - pix2[6]);
1519 s += abs(pix1[7] - pix2[7]);
1520 s += abs(pix1[8] - pix2[8]);
1521 s += abs(pix1[9] - pix2[9]);
1522 s += abs(pix1[10] - pix2[10]);
1523 s += abs(pix1[11] - pix2[11]);
1524 s += abs(pix1[12] - pix2[12]);
1525 s += abs(pix1[13] - pix2[13]);
1526 s += abs(pix1[14] - pix2[14]);
1527 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel interpolation of pix2 (avg2 of
   neighboring pixels; reads pix2[16] for the last column). */
1534 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1540 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1541 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1542 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1543 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1544 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1545 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1546 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1547 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1548 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1549 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1550 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1551 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1552 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1553 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1554 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1555 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel interpolation of pix2
   (avg2 of the current row and the next row, pix3). */
1562 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1565 uint8_t *pix3 = pix2 + line_size;
1569 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1570 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1571 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1572 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1573 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1574 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1575 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1576 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1577 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1578 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1579 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1580 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1581 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1582 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1583 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1584 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal half-pel interpolation of pix2
   (avg4 of a 2x2 neighborhood spanning two rows). */
1592 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1595 uint8_t *pix3 = pix2 + line_size;
1599 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1600 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1601 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1602 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1603 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1604 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1605 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1606 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1607 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1608 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1609 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1610 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1611 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1612 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1613 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1614 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants, mirroring the pix_abs16_* family above:
   plain, horizontal half-pel (x2), vertical half-pel (y2) and diagonal
   half-pel (xy2). NOTE(review): loop headers, pointer advances and returns
   are elided in this listing. */
1622 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1628 s += abs(pix1[0] - pix2[0]);
1629 s += abs(pix1[1] - pix2[1]);
1630 s += abs(pix1[2] - pix2[2]);
1631 s += abs(pix1[3] - pix2[3]);
1632 s += abs(pix1[4] - pix2[4]);
1633 s += abs(pix1[5] - pix2[5]);
1634 s += abs(pix1[6] - pix2[6]);
1635 s += abs(pix1[7] - pix2[7]);
1642 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1648 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1649 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1650 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1651 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1652 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1653 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1654 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1655 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1662 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1665 uint8_t *pix3 = pix2 + line_size;
1669 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1670 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1671 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1672 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1673 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1674 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1675 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1676 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1684 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1687 uint8_t *pix3 = pix2 + line_size;
1691 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1692 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1693 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1694 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1695 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1696 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1697 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1698 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is the plain squared error,
   score2 compares the local 2x2 gradients of the two blocks, so blocks
   that keep similar texture are penalized less. The gradient term is
   weighted by avctx->nsse_weight (8 if no context is given). */
1706 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1707 MpegEncContext *c = v;
1713 for(x=0; x<16; x++){
1714 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1717 for(x=0; x<15; x++){
1718 score2+= FFABS( s1[x ] - s1[x +stride]
1719 - s1[x+1] + s1[x+1+stride])
1720 -FFABS( s2[x ] - s2[x +stride]
1721 - s2[x+1] + s2[x+1+stride]);
1728 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1729 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c. */
1732 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1733 MpegEncContext *c = v;
1740 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1744 score2+= FFABS( s1[x ] - s1[x +stride]
1745 - s1[x+1] + s1[x+1+stride])
1746 -FFABS( s2[x ] - s2[x +stride]
1747 - s2[x+1] + s2[x+1+stride]);
1754 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1755 else return score1 + FFABS(score2)*8;
/* Evaluate the weighted squared error of adding 'basis' scaled by 'scale'
   to the residual 'rem' (rounding shift from BASIS to RECON precision). */
1758 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1762 for(i=0; i<8*8; i++){
1763 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1766 assert(-512<b && b<512);
1768 sum += (w*b)*(w*b)>>4;
/* Actually apply the scaled basis vector to the residual. */
1773 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1776 for(i=0; i<8*8; i++){
1777 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1782 * permutes an 8x8 block.
1783 * @param block the block which will be permuted according to the given permutation vector
1784 * @param permutation the permutation vector
1785 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1786 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1787 * (inverse) permutated to scantable order!
1789 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1795 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: gather the nonzero coefficients (in scan order) into temp */
1797 for(i=0; i<=last; i++){
1798 const int j= scantable[i];
/* second pass: scatter them back through the permutation vector */
1803 for(i=0; i<=last; i++){
1804 const int j= scantable[i];
1805 const int perm_j= permutation[j];
1806 block[perm_j]= temp[j];
/* Dummy comparison function (always the same score regardless of input). */
1810 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Select the comparison functions (cmp[0..5]) for the requested metric
   'type'; dispatch cases are elided in this listing. */
1814 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1817 memset(cmp, 0, sizeof(void*)*6);
1825 cmp[i]= c->hadamard8_diff[i];
1831 cmp[i]= c->dct_sad[i];
1834 cmp[i]= c->dct264_sad[i];
1837 cmp[i]= c->dct_max[i];
1840 cmp[i]= c->quant_psnr[i];
1869 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* SWAR byte-wise add: processes sizeof(long) bytes at a time. The pb_7f
   mask adds the low 7 bits carry-free and the pb_80 XOR restores the top
   bit of each byte, so carries never cross byte lanes. Tail bytes are
   handled one at a time. */
1874 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1876 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1877 long a = *(long*)(src+i);
1878 long b = *(long*)(dst+i);
1879 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1882 dst[i+0] += src[i+0];
/* Same SWAR trick, summing two sources into dst. */
1885 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1887 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1888 long a = *(long*)(src1+i);
1889 long b = *(long*)(src2+i);
1890 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1893 dst[i] = src1[i]+src2[i];
/* SWAR byte-wise subtract (dst = src1 - src2), with a scalar fallback on
   targets without fast unaligned loads when src2 is misaligned. */
1896 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1898 #if !HAVE_FAST_UNALIGNED
1899 if((long)src2 & (sizeof(long)-1)){
1900 for(i=0; i+7<w; i+=8){
1901 dst[i+0] = src1[i+0]-src2[i+0];
1902 dst[i+1] = src1[i+1]-src2[i+1];
1903 dst[i+2] = src1[i+2]-src2[i+2];
1904 dst[i+3] = src1[i+3]-src2[i+3];
1905 dst[i+4] = src1[i+4]-src2[i+4];
1906 dst[i+5] = src1[i+5]-src2[i+5];
1907 dst[i+6] = src1[i+6]-src2[i+6];
1908 dst[i+7] = src1[i+7]-src2[i+7];
1912 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1913 long a = *(long*)(src1+i);
1914 long b = *(long*)(src2+i);
/* borrow-free per-byte subtraction, mirror of the addition trick above */
1915 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1918 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: reconstruct samples from the median of
   left (l), top (src1[i]) and left+top-topleft gradients, plus the
   transmitted difference. Loop bodies are partially elided in this
   listing. */
1921 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1929 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Inverse of the above: compute the difference against the median
   predictor for encoding. */
1938 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1946 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left (previous-sample) prediction, returning the running accumulator. */
1956 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1959 for(i=0; i<w-1; i++){
/* BGR32 variant keeping separate running predictors per channel. */
1986 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2/BUTTERFLY1 compute sum/difference
   pairs (bodies elided in this listing); BUTTERFLYA returns |x+y|+|x-y|. */
2016 #define BUTTERFLY2(o1,o2,i1,i2) \
2020 #define BUTTERFLY1(x,y) \
2029 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference src-dst: horizontal 8-point Hadamard per row,
   then vertical butterflies per column, summing absolute values. */
2031 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2039 //FIXME try pointer walks
/* row transform on the pixel differences */
2040 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2041 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2042 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2043 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2045 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2046 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2047 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2048 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2050 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2051 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2052 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2053 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform and absolute-value accumulation */
2057 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2058 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2059 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2060 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2062 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2063 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2064 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2065 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2068 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2069 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2070 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2071 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* SATD of the 8x8 source block itself (intra): same transform as
   hadamard8_diff8x8_c but on raw pixels, with the DC term (mean)
   subtracted from the total at the end. */
2076 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2084 //FIXME try pointer walks
2085 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2086 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2087 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2088 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2090 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2091 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2092 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2093 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2095 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2096 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2097 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2098 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2102 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2103 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2104 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2105 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2107 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2108 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2109 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2110 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2113 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2114 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2115 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2116 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2119 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: transform the 8x8 pixel difference and sum the absolute
   values of the DCT coefficients. */
2124 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2125 MpegEncContext * const s= (MpegEncContext *)c;
2126 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2130 s->dsp.diff_pixels(temp, src1, src2, stride);
2132 return s->dsp.sum_abs_dctelem(temp);
/* One 8-point integer transform pass (sum/diff stage plus the shifted
   odd-part combinations); appears to be the H.264-style 8x8 forward
   transform used by dct264_sad8x8_c below. */\
2137 const int s07 = SRC(0) + SRC(7);\
2138 const int s16 = SRC(1) + SRC(6);\
2139 const int s25 = SRC(2) + SRC(5);\
2140 const int s34 = SRC(3) + SRC(4);\
2141 const int a0 = s07 + s34;\
2142 const int a1 = s16 + s25;\
2143 const int a2 = s07 - s34;\
2144 const int a3 = s16 - s25;\
2145 const int d07 = SRC(0) - SRC(7);\
2146 const int d16 = SRC(1) - SRC(6);\
2147 const int d25 = SRC(2) - SRC(5);\
2148 const int d34 = SRC(3) - SRC(4);\
2149 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2150 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2151 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2152 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2154 DST(1, a4 + (a7>>2)) ;\
2155 DST(2, a2 + (a3>>1)) ;\
2156 DST(3, a5 + (a6>>2)) ;\
2158 DST(5, a6 - (a5>>2)) ;\
2159 DST(6, (a2>>1) - a3 ) ;\
2160 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply the 8-point transform row-wise in place,
   then column-wise accumulating |coefficient| via the DST macro. */
2163 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2164 MpegEncContext * const s= (MpegEncContext *)c;
2169 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2171 #define SRC(x) dct[i][x]
2172 #define DST(x,v) dct[i][x]= v
2173 for( i = 0; i < 8; i++ )
/* second pass redefines SRC/DST to read columns and sum magnitudes */
2178 #define SRC(x) dct[x][i]
2179 #define DST(x,v) sum += FFABS(v)
2180 for( i = 0; i < 8; i++ )
/* DCT-domain max metric: transform the 8x8 difference and return the
   largest absolute coefficient. */
2188 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2189 MpegEncContext * const s= (MpegEncContext *)c;
2190 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2195 s->dsp.diff_pixels(temp, src1, src2, stride);
2199 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the difference, quantize + dequantize +
   IDCT it, and return the squared error against the unquantized copy. */
2204 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2205 MpegEncContext * const s= (MpegEncContext *)c;
2206 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2207 DCTELEM * const bak = temp+64;
2213 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep an unquantized reference copy */
2215 memcpy(bak, temp, 64*sizeof(DCTELEM));
2217 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2218 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2219 ff_simple_idct_8(temp); //FIXME
2222 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantize the 8x8 difference, estimate the VLC
   bit cost of the coefficients, reconstruct, and combine distortion (SSE)
   with a lambda-weighted bit cost. */
2227 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2228 MpegEncContext * const s= (MpegEncContext *)c;
2229 const uint8_t *scantable= s->intra_scantable.permutated;
2230 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2231 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2232 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2233 int i, last, run, bits, level, distortion, start_i;
2234 const int esc_length= s->ac_esc_length;
2236 uint8_t * last_length;
/* work on local aligned copies so reconstruction can be added in place */
2240 copy_block8(lsrc1, src1, 8, stride, 8);
2241 copy_block8(lsrc2, src2, 8, stride, 8);
2243 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2245 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick the intra or inter VLC length tables */
2251 length = s->intra_ac_vlc_length;
2252 last_length= s->intra_ac_vlc_last_length;
2253 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2256 length = s->inter_ac_vlc_length;
2257 last_length= s->inter_ac_vlc_last_length;
/* accumulate the bit cost of each (run, level) pair in scan order */
2262 for(i=start_i; i<last; i++){
2263 int j= scantable[i];
2268 if((level&(~127)) == 0){
2269 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the 'last' VLC table */
2278 level= temp[i] + 64;
2282 if((level&(~127)) == 0){
2283 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure distortion against the original difference */
2291 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2293 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2296 s->dsp.idct_add(lsrc2, 8, temp);
2298 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2300 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Pure bit-cost metric: like rd8x8_c but returns only the estimated VLC
   bit count of the quantized 8x8 difference (no reconstruction). */
2303 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2304 MpegEncContext * const s= (MpegEncContext *)c;
2305 const uint8_t *scantable= s->intra_scantable.permutated;
2306 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2307 int i, last, run, bits, level, start_i;
2308 const int esc_length= s->ac_esc_length;
2310 uint8_t * last_length;
2314 s->dsp.diff_pixels(temp, src1, src2, stride);
2316 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2322 length = s->intra_ac_vlc_length;
2323 last_length= s->intra_ac_vlc_last_length;
2324 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2327 length = s->inter_ac_vlc_length;
2328 last_length= s->inter_ac_vlc_last_length;
2333 for(i=start_i; i<last; i++){
2334 int j= scantable[i];
2339 if((level&(~127)) == 0){
2340 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2349 level= temp[i] + 64;
2353 if((level&(~127)) == 0){
2354 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2362 #define VSAD_INTRA(size) \
2363 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2367 for(y=1; y<h; y++){ \
2368 for(x=0; x<size; x+=4){ \
2369 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2370 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2380 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2385 for(x=0; x<16; x++){
2386 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): squared value helper (argument fully parenthesized). */
2395 #define SQ(a) ((a)*(a))
/* VSSE_INTRA(size): generates vsse_intra<size>_c -- like VSAD_INTRA but
 * accumulating squared row-to-row differences (vertical SSE).
 * NOTE(review): macro continuation lines are missing from this view;
 * comments kept outside the macro body. */
2396 #define VSSE_INTRA(size) \
2397 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2401 for(y=1; y<h; y++){ \
2402 for(x=0; x<size; x+=4){ \
2403 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2404 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the difference of two 16-wide blocks (squared analogue
 * of vsad16_c). NOTE(review): setup/return lines missing from this view. */
2414 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2419 for(x=0; x<16; x++){
2420 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 vector and an int16 vector
 * of the same length. NOTE(review): signature continuation and return
 * lines are missing from this sampled view. */
2429 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2433 for(i=0; i<size; i++)
2434 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 comparison functions from their 8x8 kernels: each
 * WRAPPER8_16_SQ(f8x8, f16) emits f16, which applies f8x8 to the four
 * 8x8 quadrants and sums the results (macro defined earlier in the file). */
2438 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2439 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2440 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2442 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2444 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2445 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2446 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2447 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise float multiply: dst[i] = src0[i] * src1[i] for i in [0,len). */
2449 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2451 for(i=0; i<len; i++)
2452 dst[i] = src0[i] * src1[i];
/* Multiply src0 by src1 traversed backwards: dst[i] = src0[i] * src1[-i].
 * NOTE(review): the (missing) setup line presumably advances src1 to its
 * last element first -- confirm against the full source. */
2455 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2458 for(i=0; i<len; i++)
2459 dst[i] = src0[i] * src1[-i];
/* Fused multiply-add over vectors: dst[i] = src0[i] * src1[i] + src2[i]. */
2462 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2464 for(i=0; i<len; i++)
2465 dst[i] = src0[i] * src1[i] + src2[i];
/* Overlap-add windowing (used by MDCT-based audio codecs): processes a
 * symmetric window from both ends toward the middle, writing the
 * cross-faded pair dst[i]/dst[j].
 * NOTE(review): the pointer-recentering and s0/s1/wi/wj load lines are
 * missing from this sampled view; code left byte-identical. */
2468 static void vector_fmul_window_c(float *dst, const float *src0,
2469 const float *src1, const float *win, int len)
2475 for(i=-len, j=len-1; i<0; i++, j--) {
2480 dst[i] = s0*wj - s1*wi;
2481 dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
2485 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2489 for (i = 0; i < len; i++)
2490 dst[i] = src[i] * mul;
/* Multiply-accumulate a scaled vector into dst: dst[i] += src[i] * mul. */
2493 static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
2497 for (i = 0; i < len; i++)
2498 dst[i] += src[i] * mul;
/* In-place butterfly: for each i, (v1[i], v2[i]) <- (v1[i]+v2[i], v1[i]-v2[i]).
 * restrict qualifiers promise the two arrays do not alias.
 * NOTE(review): the lines storing t/the sum back are missing from this view. */
2501 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2505 for (i = 0; i < len; i++) {
2506 float t = v1[i] - v2[i];
/* Dot product of two float vectors; the accumulation and return lines are
 * missing from this sampled view. */
2512 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2517 for (i = 0; i < len; i++)
/* Clamp one float (carried as its uint32_t bit pattern) into [min, max] for
 * the special case where min < 0 < max: positive values compare directly
 * against mini; negative values have their sign bit flipped so they can be
 * compared against maxisign (max with sign flipped). The fall-through
 * return (value in range) is on a line missing from this view. */
2523 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2524 uint32_t maxi, uint32_t maxisign)
2527 if(a > mini) return mini;
2528 else if((a^(1U<<31)) > maxisign) return maxi;
/* Clip a float vector into [*min, *max] when the bounds have opposite
 * signs, operating on raw bit patterns via clipf_c_one. Unrolled 8x, so
 * len is assumed to be a multiple of 8 (as in vector_clipf_c below).
 * NOTE(review): reading floats through uint32_t* violates strict aliasing;
 * left untouched here since this matches the file's existing technique. */
2532 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2534 uint32_t mini = *(uint32_t*)min;
2535 uint32_t maxi = *(uint32_t*)max;
2536 uint32_t maxisign = maxi ^ (1U<<31);
2537 uint32_t *dsti = (uint32_t*)dst;
2538 const uint32_t *srci = (const uint32_t*)src;
2539 for(i=0; i<len; i+=8) {
2540 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2541 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2542 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2543 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2544 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2545 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2546 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2547 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip a float vector into [min, max], 8x unrolled (len assumed to be a
 * multiple of 8). When min < 0 < max it dispatches to the bit-pattern
 * fast path; otherwise (the else branch, whose brace lines are missing
 * from this view) it uses plain av_clipf. */
2550 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2552 if(min < 0 && max > 0) {
2553 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2555 for(i=0; i < len; i+=8) {
2556 dst[i ] = av_clipf(src[i ], min, max);
2557 dst[i + 1] = av_clipf(src[i + 1], min, max);
2558 dst[i + 2] = av_clipf(src[i + 2], min, max);
2559 dst[i + 3] = av_clipf(src[i + 3], min, max);
2560 dst[i + 4] = av_clipf(src[i + 4], min, max);
2561 dst[i + 5] = av_clipf(src[i + 5], min, max);
2562 dst[i + 6] = av_clipf(src[i + 6], min, max);
2563 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors with each product right-shifted by
 * `shift` before accumulation; loop header and return are on lines missing
 * from this sampled view. */
2568 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2573 res += (*v1++ * *v2++) >> shift;
/* Combined op: accumulates the v1.v2 dot product while updating
 * v1[i] += mul * v3[i] in the same pass; the accumulation line and return
 * are on lines missing from this sampled view. */
2578 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2583 *v1++ += mul * *v3++;
/* Apply a symmetric int16 window to int16 samples with Q15 rounding:
 * each half-window coefficient w scales the matching sample from both
 * ends (output[i] and output[len-i-1]), with +2^14 rounding before the
 * >>15 normalization. */
2588 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2589 const int16_t *window, unsigned int len)
2592 int len2 = len >> 1;
2594 for (i = 0; i < len2; i++) {
2595 int16_t w = window[i];
2596 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2597 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip an int32 vector into [min, max], manually unrolled 8x; len is
 * assumed to be a multiple of 8 (the enclosing loop line is missing from
 * this sampled view). */
2601 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2602 int32_t max, unsigned int len)
2605 *dst++ = av_clip(*src++, min, max);
2606 *dst++ = av_clip(*src++, min, max);
2607 *dst++ = av_clip(*src++, min, max);
2608 *dst++ = av_clip(*src++, min, max);
2609 *dst++ = av_clip(*src++, min, max);
2610 *dst++ = av_clip(*src++, min, max);
2611 *dst++ = av_clip(*src++, min, max);
2612 *dst++ = av_clip(*src++, min, max);
/* Fixed-point DCT basis constants for the WMV2 IDCT below: round(2048 *
 * sqrt(2) * cos(k*pi/16)) for k = 1..7. W0/W4 (k=0 and k=4 share the same
 * value, 2048) is referenced by the IDCT code but its #define is on a line
 * not visible in this view. */
2618 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2619 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2620 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2621 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2622 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2623 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2624 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the 8-point WMV2 inverse DCT, in-place on b[0..7].
 * Stage 1 forms the odd (a1,a7,a5,a3) and even (a2,a6,a0,a4) butterflies;
 * s1/s2 are the rotated odd terms (181/256 ~= 1/sqrt(2)); stage 2 combines
 * them with +128 (1<<7) rounding and a >>8 normalization.
 * NOTE(review): the s1/s2 declarations and opening brace are on lines
 * missing from this sampled view. */
2626 static void wmv2_idct_row(short * b)
2629 int a0,a1,a2,a3,a4,a5,a6,a7;
2631 a1 = W1*b[1]+W7*b[7];
2632 a7 = W7*b[1]-W1*b[7];
2633 a5 = W5*b[5]+W3*b[3];
2634 a3 = W3*b[5]-W5*b[3];
2635 a2 = W2*b[2]+W6*b[6];
2636 a6 = W6*b[2]-W2*b[6];
2637 a0 = W0*b[0]+W0*b[4];
2638 a4 = W0*b[0]-W0*b[4];
2640 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2641 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2643 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2644 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2645 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2646 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2647 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2648 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2649 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2650 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column of the WMV2 inverse DCT, in-place on b[0], b[8], ... b[56].
 * Same structure as wmv2_idct_row but with extended precision: stage-1
 * terms keep 3 extra bits (+4 rounding, >>3) and the final normalization
 * is >>14 with +2^13 rounding. */
2652 static void wmv2_idct_col(short * b)
2655 int a0,a1,a2,a3,a4,a5,a6,a7;
2656 /*step 1, with extended precision*/
2657 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2658 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2659 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2660 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2661 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2662 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2663 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2664 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2666 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2667 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2669 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2670 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2671 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2672 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2674 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2675 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2676 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2677 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: all 8 rows, then all 8 columns, in-place on block.
 * The two loop headers are on lines missing from this sampled view. */
2679 void ff_wmv2_idct_c(short * block){
2683 wmv2_idct_row(block+i);
2686 wmv2_idct_col(block+i);
2689 /* XXX: these wrapper functions should be removed once every IDCT has been
 * converted to the common function-pointer interface. */
/* WMV2 IDCT + store: transform block, then write clamped pixels to dest. */
2691 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2693 ff_wmv2_idct_c(block);
2694 ff_put_pixels_clamped_c(block, dest, line_size);
/* WMV2 IDCT + accumulate: transform block, then add clamped result to dest. */
2696 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2698 ff_wmv2_idct_c(block);
2699 ff_add_pixels_clamped_c(block, dest, line_size);
/* JREF (IJG reference) IDCT wrappers at full and reduced (lowres)
 * resolutions: each performs the j_rev_dct* transform (the call lines are
 * missing from this sampled view) and then stores (put) or accumulates
 * (add) the clamped pixels. The idct4/2 variants back lowres decoding;
 * idct1 is DC-only: (block[0]+4)>>3 clamped through ff_cropTbl. */
2701 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2704 ff_put_pixels_clamped_c(block, dest, line_size);
2706 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2709 ff_add_pixels_clamped_c(block, dest, line_size);
2712 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2715 put_pixels_clamped4_c(block, dest, line_size);
2717 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2720 add_pixels_clamped4_c(block, dest, line_size);
2723 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2726 put_pixels_clamped2_c(block, dest, line_size);
2728 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2731 add_pixels_clamped2_c(block, dest, line_size);
2734 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2736 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* DC-only store: single pixel from block[0] with rounding and clamping */
2738 dest[0] = cm[(block[0] + 4)>>3];
2740 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2742 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* DC-only accumulate into the existing pixel, clamped */
2744 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op stand-in used as the default prefetch hook (see c->prefetch in
 * dsputil_init): matches the prefetch signature but does nothing. */
2747 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2749 /* init static data */
/* One-time setup of global lookup tables:
 *  - ff_cropTbl: clamp-to-[0,255] table with MAX_NEG_CROP guard bands on
 *    both sides (identity in the middle, saturating above; the low-side
 *    fill line is missing from this sampled view),
 *  - ff_squareTbl: (i-256)^2 for squared-difference metrics,
 *  - inv_zigzag_direct16: inverse zigzag permutation, stored 1-based. */
2750 av_cold void dsputil_static_init(void)
2754 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2755 for(i=0;i<MAX_NEG_CROP;i++) {
2757 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2760 for(i=0;i<512;i++) {
2761 ff_squareTbl[i] = (i - 256) * (i - 256);
2764 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honors 16-byte stack alignment
 * (required by SSE/AltiVec code paths). If a LOCAL_ALIGNED_16 variable
 * turns out misaligned, logs a one-time miscompilation warning on
 * MMX/AltiVec builds. The did_fail bookkeeping and return statements are
 * on lines missing from this sampled view. */
2767 int ff_check_alignment(void){
2768 static int did_fail=0;
2769 LOCAL_ALIGNED_16(int, aligned, [4]);
2771 if((intptr_t)aligned & 15){
2773 #if HAVE_MMX || HAVE_ALTIVEC
2774 av_log(NULL, AV_LOG_ERROR,
2775 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2776 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2777 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2778 "Do not report crashes to Libav developers.\n");
2787 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2791 ff_check_alignment();
2794 if (avctx->bits_per_raw_sample == 10) {
2795 c->fdct = ff_jpeg_fdct_islow_10;
2796 c->fdct248 = ff_fdct248_islow_10;
2798 if(avctx->dct_algo==FF_DCT_FASTINT) {
2799 c->fdct = fdct_ifast;
2800 c->fdct248 = fdct_ifast248;
2802 else if(avctx->dct_algo==FF_DCT_FAAN) {
2803 c->fdct = ff_faandct;
2804 c->fdct248 = ff_faandct248;
2807 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2808 c->fdct248 = ff_fdct248_islow_8;
2811 #endif //CONFIG_ENCODERS
2813 if(avctx->lowres==1){
2814 c->idct_put= ff_jref_idct4_put;
2815 c->idct_add= ff_jref_idct4_add;
2816 c->idct = j_rev_dct4;
2817 c->idct_permutation_type= FF_NO_IDCT_PERM;
2818 }else if(avctx->lowres==2){
2819 c->idct_put= ff_jref_idct2_put;
2820 c->idct_add= ff_jref_idct2_add;
2821 c->idct = j_rev_dct2;
2822 c->idct_permutation_type= FF_NO_IDCT_PERM;
2823 }else if(avctx->lowres==3){
2824 c->idct_put= ff_jref_idct1_put;
2825 c->idct_add= ff_jref_idct1_add;
2826 c->idct = j_rev_dct1;
2827 c->idct_permutation_type= FF_NO_IDCT_PERM;
2829 if (avctx->bits_per_raw_sample == 10) {
2830 c->idct_put = ff_simple_idct_put_10;
2831 c->idct_add = ff_simple_idct_add_10;
2832 c->idct = ff_simple_idct_10;
2833 c->idct_permutation_type = FF_NO_IDCT_PERM;
2835 if(avctx->idct_algo==FF_IDCT_INT){
2836 c->idct_put= ff_jref_idct_put;
2837 c->idct_add= ff_jref_idct_add;
2838 c->idct = j_rev_dct;
2839 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2840 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2841 avctx->idct_algo==FF_IDCT_VP3){
2842 c->idct_put= ff_vp3_idct_put_c;
2843 c->idct_add= ff_vp3_idct_add_c;
2844 c->idct = ff_vp3_idct_c;
2845 c->idct_permutation_type= FF_NO_IDCT_PERM;
2846 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2847 c->idct_put= ff_wmv2_idct_put_c;
2848 c->idct_add= ff_wmv2_idct_add_c;
2849 c->idct = ff_wmv2_idct_c;
2850 c->idct_permutation_type= FF_NO_IDCT_PERM;
2851 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2852 c->idct_put= ff_faanidct_put;
2853 c->idct_add= ff_faanidct_add;
2854 c->idct = ff_faanidct;
2855 c->idct_permutation_type= FF_NO_IDCT_PERM;
2856 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2857 c->idct_put= ff_ea_idct_put_c;
2858 c->idct_permutation_type= FF_NO_IDCT_PERM;
2859 }else{ //accurate/default
2860 c->idct_put = ff_simple_idct_put_8;
2861 c->idct_add = ff_simple_idct_add_8;
2862 c->idct = ff_simple_idct_8;
2863 c->idct_permutation_type= FF_NO_IDCT_PERM;
2868 c->diff_pixels = diff_pixels_c;
2869 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2870 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2871 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2872 c->sum_abs_dctelem = sum_abs_dctelem_c;
2875 c->pix_sum = pix_sum_c;
2876 c->pix_norm1 = pix_norm1_c;
2878 c->fill_block_tab[0] = fill_block16_c;
2879 c->fill_block_tab[1] = fill_block8_c;
2881 /* TODO [0] 16 [1] 8 */
2882 c->pix_abs[0][0] = pix_abs16_c;
2883 c->pix_abs[0][1] = pix_abs16_x2_c;
2884 c->pix_abs[0][2] = pix_abs16_y2_c;
2885 c->pix_abs[0][3] = pix_abs16_xy2_c;
2886 c->pix_abs[1][0] = pix_abs8_c;
2887 c->pix_abs[1][1] = pix_abs8_x2_c;
2888 c->pix_abs[1][2] = pix_abs8_y2_c;
2889 c->pix_abs[1][3] = pix_abs8_xy2_c;
2891 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2892 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2893 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2894 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2895 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2896 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2897 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2898 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2899 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2901 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2902 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2903 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2904 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2905 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2906 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2907 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2908 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2909 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2911 #define dspfunc(PFX, IDX, NUM) \
2912 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2913 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2914 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2915 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2916 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2917 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2918 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2919 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2920 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2921 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2922 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2923 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2924 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2925 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2926 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2927 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2929 dspfunc(put_qpel, 0, 16);
2930 dspfunc(put_no_rnd_qpel, 0, 16);
2932 dspfunc(avg_qpel, 0, 16);
2933 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2935 dspfunc(put_qpel, 1, 8);
2936 dspfunc(put_no_rnd_qpel, 1, 8);
2938 dspfunc(avg_qpel, 1, 8);
2939 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2943 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2944 ff_mlp_init(c, avctx);
2946 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2947 ff_intrax8dsp_init(c,avctx);
2950 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2951 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2952 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2953 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2954 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2955 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2956 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2957 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2959 #define SET_CMP_FUNC(name) \
2960 c->name[0]= name ## 16_c;\
2961 c->name[1]= name ## 8x8_c;
2963 SET_CMP_FUNC(hadamard8_diff)
2964 c->hadamard8_diff[4]= hadamard8_intra16_c;
2965 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2966 SET_CMP_FUNC(dct_sad)
2967 SET_CMP_FUNC(dct_max)
2969 SET_CMP_FUNC(dct264_sad)
2971 c->sad[0]= pix_abs16_c;
2972 c->sad[1]= pix_abs8_c;
2976 SET_CMP_FUNC(quant_psnr)
2979 c->vsad[0]= vsad16_c;
2980 c->vsad[4]= vsad_intra16_c;
2981 c->vsad[5]= vsad_intra8_c;
2982 c->vsse[0]= vsse16_c;
2983 c->vsse[4]= vsse_intra16_c;
2984 c->vsse[5]= vsse_intra8_c;
2985 c->nsse[0]= nsse16_c;
2986 c->nsse[1]= nsse8_c;
2988 ff_dsputil_init_dwt(c);
2991 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2993 c->add_bytes= add_bytes_c;
2994 c->add_bytes_l2= add_bytes_l2_c;
2995 c->diff_bytes= diff_bytes_c;
2996 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2997 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2998 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2999 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3000 c->bswap_buf= bswap_buf;
3001 c->bswap16_buf = bswap16_buf;
3002 #if CONFIG_PNG_DECODER
3003 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3006 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3007 c->h263_h_loop_filter= h263_h_loop_filter_c;
3008 c->h263_v_loop_filter= h263_v_loop_filter_c;
3011 if (CONFIG_VP3_DECODER) {
3012 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3013 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3014 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3017 c->h261_loop_filter= h261_loop_filter_c;
3019 c->try_8x8basis= try_8x8basis_c;
3020 c->add_8x8basis= add_8x8basis_c;
3022 #if CONFIG_VORBIS_DECODER
3023 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3025 #if CONFIG_AC3_DECODER
3026 c->ac3_downmix = ff_ac3_downmix_c;
3028 c->vector_fmul = vector_fmul_c;
3029 c->vector_fmul_reverse = vector_fmul_reverse_c;
3030 c->vector_fmul_add = vector_fmul_add_c;
3031 c->vector_fmul_window = vector_fmul_window_c;
3032 c->vector_clipf = vector_clipf_c;
3033 c->scalarproduct_int16 = scalarproduct_int16_c;
3034 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3035 c->apply_window_int16 = apply_window_int16_c;
3036 c->vector_clip_int32 = vector_clip_int32_c;
3037 c->scalarproduct_float = scalarproduct_float_c;
3038 c->butterflies_float = butterflies_float_c;
3039 c->vector_fmul_scalar = vector_fmul_scalar_c;
3040 c->vector_fmac_scalar = vector_fmac_scalar_c;
3042 c->shrink[0]= av_image_copy_plane;
3043 c->shrink[1]= ff_shrink22;
3044 c->shrink[2]= ff_shrink44;
3045 c->shrink[3]= ff_shrink88;
3047 c->prefetch= just_return;
3049 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3050 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3054 #define FUNC(f, depth) f ## _ ## depth
3055 #define FUNCC(f, depth) f ## _ ## depth ## _c
3057 #define dspfunc1(PFX, IDX, NUM, depth)\
3058 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3059 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3060 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3061 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3063 #define dspfunc2(PFX, IDX, NUM, depth)\
3064 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3065 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3066 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3067 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3068 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3069 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3070 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3071 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3072 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3073 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3074 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3075 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3076 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3077 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3078 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3079 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3082 #define BIT_DEPTH_FUNCS(depth, dct)\
3083 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3084 c->draw_edges = FUNCC(draw_edges , depth);\
3085 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3086 c->clear_block = FUNCC(clear_block ## dct , depth);\
3087 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3088 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3089 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3090 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3091 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3093 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3094 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3095 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3096 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3097 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3098 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3100 dspfunc1(put , 0, 16, depth);\
3101 dspfunc1(put , 1, 8, depth);\
3102 dspfunc1(put , 2, 4, depth);\
3103 dspfunc1(put , 3, 2, depth);\
3104 dspfunc1(put_no_rnd, 0, 16, depth);\
3105 dspfunc1(put_no_rnd, 1, 8, depth);\
3106 dspfunc1(avg , 0, 16, depth);\
3107 dspfunc1(avg , 1, 8, depth);\
3108 dspfunc1(avg , 2, 4, depth);\
3109 dspfunc1(avg , 3, 2, depth);\
3110 dspfunc1(avg_no_rnd, 0, 16, depth);\
3111 dspfunc1(avg_no_rnd, 1, 8, depth);\
3113 dspfunc2(put_h264_qpel, 0, 16, depth);\
3114 dspfunc2(put_h264_qpel, 1, 8, depth);\
3115 dspfunc2(put_h264_qpel, 2, 4, depth);\
3116 dspfunc2(put_h264_qpel, 3, 2, depth);\
3117 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3118 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3119 dspfunc2(avg_h264_qpel, 2, 4, depth);
3121 switch (avctx->bits_per_raw_sample) {
3123 if (c->dct_bits == 32) {
3124 BIT_DEPTH_FUNCS(9, _32);
3126 BIT_DEPTH_FUNCS(9, _16);
3130 if (c->dct_bits == 32) {
3131 BIT_DEPTH_FUNCS(10, _32);
3133 BIT_DEPTH_FUNCS(10, _16);
3137 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3139 BIT_DEPTH_FUNCS(8, _16);
3144 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3145 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3146 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3147 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3148 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3149 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3150 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3151 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3152 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3154 for(i=0; i<64; i++){
3155 if(!c->put_2tap_qpel_pixels_tab[0][i])
3156 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3157 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3158 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3161 ff_init_scantable_permutation(c->idct_permutation,
3162 c->idct_permutation_type);