git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "diracdsp.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define pixeltmp int16_t
  47 #define BIT_DEPTH 9
  48 #include "dsputil_template.c"
  49 #undef BIT_DEPTH
  50
  51 #define BIT_DEPTH 10
  52 #include "dsputil_template.c"
  53 #undef BIT_DEPTH
  54
  55 #undef pixeltmp
  56 #define pixeltmp int32_t
  57 #define BIT_DEPTH 12
  58 #include "dsputil_template.c"
  59 #undef BIT_DEPTH
  60
  61 #define BIT_DEPTH 14
  62 #include "dsputil_template.c"
  63 #undef BIT_DEPTH
  64
  65 #undef pixeltmp
  66 #define pixeltmp int16_t
  67 #define BIT_DEPTH 8
  68 #include "dsputil_template.c"
  69 #undef pixeltmp
  70
  71 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  72 #define pb_7f (~0UL/255 * 0x7f)
  73 #define pb_80 (~0UL/255 * 0x80)
  74
  75 const uint8_t ff_zigzag_direct[64] = {
  76     0,   1,  8, 16,  9,  2,  3, 10,
  77     17, 24, 32, 25, 18, 11,  4,  5,
  78     12, 19, 26, 33, 40, 48, 41, 34,
  79     27, 20, 13,  6,  7, 14, 21, 28,
  80     35, 42, 49, 56, 57, 50, 43, 36,
  81     29, 22, 15, 23, 30, 37, 44, 51,
  82     58, 59, 52, 45, 38, 31, 39, 46,
  83     53, 60, 61, 54, 47, 55, 62, 63
  84 };
  85
  86 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  87    specification, we interleave the fields */
  88 const uint8_t ff_zigzag248_direct[64] = {
  89      0,  8,  1,  9, 16, 24,  2, 10,
  90     17, 25, 32, 40, 48, 56, 33, 41,
  91     18, 26,  3, 11,  4, 12, 19, 27,
  92     34, 42, 49, 57, 50, 58, 35, 43,
  93     20, 28,  5, 13,  6, 14, 21, 29,
  94     36, 44, 51, 59, 52, 60, 37, 45,
  95     22, 30,  7, 15, 23, 31, 38, 46,
  96     53, 61, 54, 62, 39, 47, 55, 63,
  97 };
  98
/* Inverse of ff_zigzag_direct (not IDCT-permuted), with 1 added to each
 * entry, for the MMX quantizer. The table is only declared here; it is
 * presumably filled in during DSP initialisation -- verify against the
 * init code. Declared with 16-byte alignment via DECLARE_ALIGNED. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
 101
 102 const uint8_t ff_alternate_horizontal_scan[64] = {
 103     0,  1,   2,  3,  8,  9, 16, 17,
 104     10, 11,  4,  5,  6,  7, 15, 14,
 105     13, 12, 19, 18, 24, 25, 32, 33,
 106     26, 27, 20, 21, 22, 23, 28, 29,
 107     30, 31, 34, 35, 40, 41, 48, 49,
 108     42, 43, 36, 37, 38, 39, 44, 45,
 109     46, 47, 50, 51, 56, 57, 58, 59,
 110     52, 53, 54, 55, 60, 61, 62, 63,
 111 };
 112
 113 const uint8_t ff_alternate_vertical_scan[64] = {
 114     0,  8,  16, 24,  1,  9,  2, 10,
 115     17, 25, 32, 40, 48, 56, 57, 49,
 116     41, 33, 26, 18,  3, 11,  4, 12,
 117     19, 27, 34, 42, 50, 58, 35, 43,
 118     51, 59, 20, 28,  5, 13,  6, 14,
 119     21, 29, 36, 44, 52, 60, 37, 45,
 120     53, 61, 22, 30,  7, 15, 23, 31,
 121     38, 46, 54, 62, 39, 47, 55, 63,
 122 };
 123
 124 /* Input permutation for the simple_idct_mmx */
 125 static const uint8_t simple_mmx_permutation[64]={
 126         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 127         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 128         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 129         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 130         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 131         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 132         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 133         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 134 };
 135
 136 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 137
 138 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 139     int i;
 140     int end;
 141
 142     st->scantable= src_scantable;
 143
 144     for(i=0; i<64; i++){
 145         int j;
 146         j = src_scantable[i];
 147         st->permutated[i] = permutation[j];
 148     }
 149
 150     end=-1;
 151     for(i=0; i<64; i++){
 152         int j;
 153         j = st->permutated[i];
 154         if(j>end) end=j;
 155         st->raster_end[i]= end;
 156     }
 157 }
 158
 159 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 160                                    int idct_permutation_type)
 161 {
 162     int i;
 163
 164     switch(idct_permutation_type){
 165     case FF_NO_IDCT_PERM:
 166         for(i=0; i<64; i++)
 167             idct_permutation[i]= i;
 168         break;
 169     case FF_LIBMPEG2_IDCT_PERM:
 170         for(i=0; i<64; i++)
 171             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 172         break;
 173     case FF_SIMPLE_IDCT_PERM:
 174         for(i=0; i<64; i++)
 175             idct_permutation[i]= simple_mmx_permutation[i];
 176         break;
 177     case FF_TRANSPOSE_IDCT_PERM:
 178         for(i=0; i<64; i++)
 179             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 180         break;
 181     case FF_PARTTRANS_IDCT_PERM:
 182         for(i=0; i<64; i++)
 183             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 184         break;
 185     case FF_SSE2_IDCT_PERM:
 186         for(i=0; i<64; i++)
 187             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 188         break;
 189     default:
 190         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 191     }
 192 }
 193
 194 static int pix_sum_c(uint8_t * pix, int line_size)
 195 {
 196     int s, i, j;
 197
 198     s = 0;
 199     for (i = 0; i < 16; i++) {
 200         for (j = 0; j < 16; j += 8) {
 201             s += pix[0];
 202             s += pix[1];
 203             s += pix[2];
 204             s += pix[3];
 205             s += pix[4];
 206             s += pix[5];
 207             s += pix[6];
 208             s += pix[7];
 209             pix += 8;
 210         }
 211         pix += line_size - 16;
 212     }
 213     return s;
 214 }
 215
 216 static int pix_norm1_c(uint8_t * pix, int line_size)
 217 {
 218     int s, i, j;
 219     uint32_t *sq = ff_squareTbl + 256;
 220
 221     s = 0;
 222     for (i = 0; i < 16; i++) {
 223         for (j = 0; j < 16; j += 8) {
 224 #if 0
 225             s += sq[pix[0]];
 226             s += sq[pix[1]];
 227             s += sq[pix[2]];
 228             s += sq[pix[3]];
 229             s += sq[pix[4]];
 230             s += sq[pix[5]];
 231             s += sq[pix[6]];
 232             s += sq[pix[7]];
 233 #else
 234 #if HAVE_FAST_64BIT
 235             register uint64_t x=*(uint64_t*)pix;
 236             s += sq[x&0xff];
 237             s += sq[(x>>8)&0xff];
 238             s += sq[(x>>16)&0xff];
 239             s += sq[(x>>24)&0xff];
 240             s += sq[(x>>32)&0xff];
 241             s += sq[(x>>40)&0xff];
 242             s += sq[(x>>48)&0xff];
 243             s += sq[(x>>56)&0xff];
 244 #else
 245             register uint32_t x=*(uint32_t*)pix;
 246             s += sq[x&0xff];
 247             s += sq[(x>>8)&0xff];
 248             s += sq[(x>>16)&0xff];
 249             s += sq[(x>>24)&0xff];
 250             x=*(uint32_t*)(pix+4);
 251             s += sq[x&0xff];
 252             s += sq[(x>>8)&0xff];
 253             s += sq[(x>>16)&0xff];
 254             s += sq[(x>>24)&0xff];
 255 #endif
 256 #endif
 257             pix += 8;
 258         }
 259         pix += line_size - 16;
 260     }
 261     return s;
 262 }
 263
 264 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 265     int i;
 266
 267     for(i=0; i+8<=w; i+=8){
 268         dst[i+0]= av_bswap32(src[i+0]);
 269         dst[i+1]= av_bswap32(src[i+1]);
 270         dst[i+2]= av_bswap32(src[i+2]);
 271         dst[i+3]= av_bswap32(src[i+3]);
 272         dst[i+4]= av_bswap32(src[i+4]);
 273         dst[i+5]= av_bswap32(src[i+5]);
 274         dst[i+6]= av_bswap32(src[i+6]);
 275         dst[i+7]= av_bswap32(src[i+7]);
 276     }
 277     for(;i<w; i++){
 278         dst[i+0]= av_bswap32(src[i+0]);
 279     }
 280 }
 281
 282 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 283 {
 284     while (len--)
 285         *dst++ = av_bswap16(*src++);
 286 }
 287
 288 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 289 {
 290     int s, i;
 291     uint32_t *sq = ff_squareTbl + 256;
 292
 293     s = 0;
 294     for (i = 0; i < h; i++) {
 295         s += sq[pix1[0] - pix2[0]];
 296         s += sq[pix1[1] - pix2[1]];
 297         s += sq[pix1[2] - pix2[2]];
 298         s += sq[pix1[3] - pix2[3]];
 299         pix1 += line_size;
 300         pix2 += line_size;
 301     }
 302     return s;
 303 }
 304
 305 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 306 {
 307     int s, i;
 308     uint32_t *sq = ff_squareTbl + 256;
 309
 310     s = 0;
 311     for (i = 0; i < h; i++) {
 312         s += sq[pix1[0] - pix2[0]];
 313         s += sq[pix1[1] - pix2[1]];
 314         s += sq[pix1[2] - pix2[2]];
 315         s += sq[pix1[3] - pix2[3]];
 316         s += sq[pix1[4] - pix2[4]];
 317         s += sq[pix1[5] - pix2[5]];
 318         s += sq[pix1[6] - pix2[6]];
 319         s += sq[pix1[7] - pix2[7]];
 320         pix1 += line_size;
 321         pix2 += line_size;
 322     }
 323     return s;
 324 }
 325
 326 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 327 {
 328     int s, i;
 329     uint32_t *sq = ff_squareTbl + 256;
 330
 331     s = 0;
 332     for (i = 0; i < h; i++) {
 333         s += sq[pix1[ 0] - pix2[ 0]];
 334         s += sq[pix1[ 1] - pix2[ 1]];
 335         s += sq[pix1[ 2] - pix2[ 2]];
 336         s += sq[pix1[ 3] - pix2[ 3]];
 337         s += sq[pix1[ 4] - pix2[ 4]];
 338         s += sq[pix1[ 5] - pix2[ 5]];
 339         s += sq[pix1[ 6] - pix2[ 6]];
 340         s += sq[pix1[ 7] - pix2[ 7]];
 341         s += sq[pix1[ 8] - pix2[ 8]];
 342         s += sq[pix1[ 9] - pix2[ 9]];
 343         s += sq[pix1[10] - pix2[10]];
 344         s += sq[pix1[11] - pix2[11]];
 345         s += sq[pix1[12] - pix2[12]];
 346         s += sq[pix1[13] - pix2[13]];
 347         s += sq[pix1[14] - pix2[14]];
 348         s += sq[pix1[15] - pix2[15]];
 349
 350         pix1 += line_size;
 351         pix2 += line_size;
 352     }
 353     return s;
 354 }
 355
 356 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 357                           const uint8_t *s2, int stride){
 358     int i;
 359
 360     /* read the pixels */
 361     for(i=0;i<8;i++) {
 362         block[0] = s1[0] - s2[0];
 363         block[1] = s1[1] - s2[1];
 364         block[2] = s1[2] - s2[2];
 365         block[3] = s1[3] - s2[3];
 366         block[4] = s1[4] - s2[4];
 367         block[5] = s1[5] - s2[5];
 368         block[6] = s1[6] - s2[6];
 369         block[7] = s1[7] - s2[7];
 370         s1 += stride;
 371         s2 += stride;
 372         block += 8;
 373     }
 374 }
 375
 376
 377 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 378                              int line_size)
 379 {
 380     int i;
 381
 382     /* read the pixels */
 383     for(i=0;i<8;i++) {
 384         pixels[0] = av_clip_uint8(block[0]);
 385         pixels[1] = av_clip_uint8(block[1]);
 386         pixels[2] = av_clip_uint8(block[2]);
 387         pixels[3] = av_clip_uint8(block[3]);
 388         pixels[4] = av_clip_uint8(block[4]);
 389         pixels[5] = av_clip_uint8(block[5]);
 390         pixels[6] = av_clip_uint8(block[6]);
 391         pixels[7] = av_clip_uint8(block[7]);
 392
 393         pixels += line_size;
 394         block += 8;
 395     }
 396 }
 397
 398 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 399                                  int line_size)
 400 {
 401     int i;
 402
 403     /* read the pixels */
 404     for(i=0;i<4;i++) {
 405         pixels[0] = av_clip_uint8(block[0]);
 406         pixels[1] = av_clip_uint8(block[1]);
 407         pixels[2] = av_clip_uint8(block[2]);
 408         pixels[3] = av_clip_uint8(block[3]);
 409
 410         pixels += line_size;
 411         block += 8;
 412     }
 413 }
 414
 415 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 416                                  int line_size)
 417 {
 418     int i;
 419
 420     /* read the pixels */
 421     for(i=0;i<2;i++) {
 422         pixels[0] = av_clip_uint8(block[0]);
 423         pixels[1] = av_clip_uint8(block[1]);
 424
 425         pixels += line_size;
 426         block += 8;
 427     }
 428 }
 429
 430 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 431                                     uint8_t *restrict pixels,
 432                                     int line_size)
 433 {
 434     int i, j;
 435
 436     for (i = 0; i < 8; i++) {
 437         for (j = 0; j < 8; j++) {
 438             if (*block < -128)
 439                 *pixels = 0;
 440             else if (*block > 127)
 441                 *pixels = 255;
 442             else
 443                 *pixels = (uint8_t)(*block + 128);
 444             block++;
 445             pixels++;
 446         }
 447         pixels += (line_size - 8);
 448     }
 449 }
 450
 451 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 452                              int line_size)
 453 {
 454     int i;
 455
 456     /* read the pixels */
 457     for(i=0;i<8;i++) {
 458         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 459         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 460         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 461         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 462         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 463         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 464         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 465         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 466         pixels += line_size;
 467         block += 8;
 468     }
 469 }
 470
 471 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 472                           int line_size)
 473 {
 474     int i;
 475
 476     /* read the pixels */
 477     for(i=0;i<4;i++) {
 478         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 479         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 480         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 481         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 482         pixels += line_size;
 483         block += 8;
 484     }
 485 }
 486
 487 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 488                           int line_size)
 489 {
 490     int i;
 491
 492     /* read the pixels */
 493     for(i=0;i<2;i++) {
 494         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 495         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 496         pixels += line_size;
 497         block += 8;
 498     }
 499 }
 500
 501 static int sum_abs_dctelem_c(DCTELEM *block)
 502 {
 503     int sum=0, i;
 504     for(i=0; i<64; i++)
 505         sum+= FFABS(block[i]);
 506     return sum;
 507 }
 508
 509 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 510 {
 511     int i;
 512
 513     for (i = 0; i < h; i++) {
 514         memset(block, value, 16);
 515         block += line_size;
 516     }
 517 }
 518
 519 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 520 {
 521     int i;
 522
 523     for (i = 0; i < h; i++) {
 524         memset(block, value, 8);
 525         block += line_size;
 526     }
 527 }
 528
 529 #define avg2(a,b) ((a+b+1)>>1)
 530 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 531
 532 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 533 {
 534     const int A=(16-x16)*(16-y16);
 535     const int B=(   x16)*(16-y16);
 536     const int C=(16-x16)*(   y16);
 537     const int D=(   x16)*(   y16);
 538     int i;
 539
 540     for(i=0; i<h; i++)
 541     {
 542         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 543         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 544         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 545         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 546         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 547         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 548         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 549         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 550         dst+= stride;
 551         src+= stride;
 552     }
 553 }
 554
 555 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 556                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 557 {
 558     int y, vx, vy;
 559     const int s= 1<<shift;
 560
 561     width--;
 562     height--;
 563
 564     for(y=0; y<h; y++){
 565         int x;
 566
 567         vx= ox;
 568         vy= oy;
 569         for(x=0; x<8; x++){ //XXX FIXME optimize
 570             int src_x, src_y, frac_x, frac_y, index;
 571
 572             src_x= vx>>16;
 573             src_y= vy>>16;
 574             frac_x= src_x&(s-1);
 575             frac_y= src_y&(s-1);
 576             src_x>>=shift;
 577             src_y>>=shift;
 578
 579             if((unsigned)src_x < width){
 580                 if((unsigned)src_y < height){
 581                     index= src_x + src_y*stride;
 582                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 583                                            + src[index       +1]*   frac_x )*(s-frac_y)
 584                                         + (  src[index+stride  ]*(s-frac_x)
 585                                            + src[index+stride+1]*   frac_x )*   frac_y
 586                                         + r)>>(shift*2);
 587                 }else{
 588                     index= src_x + av_clip(src_y, 0, height)*stride;
 589                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 590                                           + src[index       +1]*   frac_x )*s
 591                                         + r)>>(shift*2);
 592                 }
 593             }else{
 594                 if((unsigned)src_y < height){
 595                     index= av_clip(src_x, 0, width) + src_y*stride;
 596                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 597                                            + src[index+stride  ]*   frac_y )*s
 598                                         + r)>>(shift*2);
 599                 }else{
 600                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 601                     dst[y*stride + x]=    src[index         ];
 602                 }
 603             }
 604
 605             vx+= dxx;
 606             vy+= dyx;
 607         }
 608         ox += dxy;
 609         oy += dyy;
 610     }
 611 }
 612
 613 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 614     switch(width){
 615     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 616     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 617     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 618     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 619     }
 620 }
 621
 622 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 623     int i,j;
 624     for (i=0; i < height; i++) {
 625       for (j=0; j < width; j++) {
 626         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 627       }
 628       src += stride;
 629       dst += stride;
 630     }
 631 }
 632
 633 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 634     int i,j;
 635     for (i=0; i < height; i++) {
 636       for (j=0; j < width; j++) {
 637         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 638       }
 639       src += stride;
 640       dst += stride;
 641     }
 642 }
 643
 644 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 645     int i,j;
 646     for (i=0; i < height; i++) {
 647       for (j=0; j < width; j++) {
 648         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 649       }
 650       src += stride;
 651       dst += stride;
 652     }
 653 }
 654
 655 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 656     int i,j;
 657     for (i=0; i < height; i++) {
 658       for (j=0; j < width; j++) {
 659         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 660       }
 661       src += stride;
 662       dst += stride;
 663     }
 664 }
 665
 666 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 667     int i,j;
 668     for (i=0; i < height; i++) {
 669       for (j=0; j < width; j++) {
 670         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 671       }
 672       src += stride;
 673       dst += stride;
 674     }
 675 }
 676
 677 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 678     int i,j;
 679     for (i=0; i < height; i++) {
 680       for (j=0; j < width; j++) {
 681         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 682       }
 683       src += stride;
 684       dst += stride;
 685     }
 686 }
 687
 688 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 689     int i,j;
 690     for (i=0; i < height; i++) {
 691       for (j=0; j < width; j++) {
 692         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 693       }
 694       src += stride;
 695       dst += stride;
 696     }
 697 }
 698
 699 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 700     int i,j;
 701     for (i=0; i < height; i++) {
 702       for (j=0; j < width; j++) {
 703         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 704       }
 705       src += stride;
 706       dst += stride;
 707     }
 708 }
 709
 710 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 711     switch(width){
 712     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 713     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 714     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 715     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 716     }
 717 }
 718
 719 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 720     int i,j;
 721     for (i=0; i < height; i++) {
 722       for (j=0; j < width; j++) {
 723         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 724       }
 725       src += stride;
 726       dst += stride;
 727     }
 728 }
 729
 730 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 731     int i,j;
 732     for (i=0; i < height; i++) {
 733       for (j=0; j < width; j++) {
 734         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 735       }
 736       src += stride;
 737       dst += stride;
 738     }
 739 }
 740
 741 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 742     int i,j;
 743     for (i=0; i < height; i++) {
 744       for (j=0; j < width; j++) {
 745         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 746       }
 747       src += stride;
 748       dst += stride;
 749     }
 750 }
 751
 752 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 753     int i,j;
 754     for (i=0; i < height; i++) {
 755       for (j=0; j < width; j++) {
 756         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 757       }
 758       src += stride;
 759       dst += stride;
 760     }
 761 }
 762
 763 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 764     int i,j;
 765     for (i=0; i < height; i++) {
 766       for (j=0; j < width; j++) {
 767         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 768       }
 769       src += stride;
 770       dst += stride;
 771     }
 772 }
 773
 774 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 775     int i,j;
 776     for (i=0; i < height; i++) {
 777       for (j=0; j < width; j++) {
 778         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 779       }
 780       src += stride;
 781       dst += stride;
 782     }
 783 }
 784
 785 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 786     int i,j;
 787     for (i=0; i < height; i++) {
 788       for (j=0; j < width; j++) {
 789         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 790       }
 791       src += stride;
 792       dst += stride;
 793     }
 794 }
 795
 796 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 797     int i,j;
 798     for (i=0; i < height; i++) {
 799       for (j=0; j < width; j++) {
 800         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 801       }
 802       src += stride;
 803       dst += stride;
 804     }
 805 }
 806
 807 #define QPEL_MC(r, OPNAME, RND, OP) \
 808 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 809     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 810     int i;\
 811     for(i=0; i<h; i++)\
 812     {\
 813         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 814         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 815         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 816         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 817         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 818         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 819         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 820         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 821         dst+=dstStride;\
 822         src+=srcStride;\
 823     }\
 824 }\
 825 \
 826 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 827     const int w=8;\
 828     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 829     int i;\
 830     for(i=0; i<w; i++)\
 831     {\
 832         const int src0= src[0*srcStride];\
 833         const int src1= src[1*srcStride];\
 834         const int src2= src[2*srcStride];\
 835         const int src3= src[3*srcStride];\
 836         const int src4= src[4*srcStride];\
 837         const int src5= src[5*srcStride];\
 838         const int src6= src[6*srcStride];\
 839         const int src7= src[7*srcStride];\
 840         const int src8= src[8*srcStride];\
 841         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 842         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 843         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 844         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 845         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 846         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 847         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 848         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 849         dst++;\
 850         src++;\
 851     }\
 852 }\
 853 \
 854 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 855     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 856     int i;\
 857     \
 858     for(i=0; i<h; i++)\
 859     {\
 860         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 861         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 862         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 863         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 864         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 865         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 866         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 867         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 868         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 869         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 870         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 871         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 872         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 873         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 874         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 875         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 876         dst+=dstStride;\
 877         src+=srcStride;\
 878     }\
 879 }\
 880 \
 881 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 882     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 883     int i;\
 884     const int w=16;\
 885     for(i=0; i<w; i++)\
 886     {\
 887         const int src0= src[0*srcStride];\
 888         const int src1= src[1*srcStride];\
 889         const int src2= src[2*srcStride];\
 890         const int src3= src[3*srcStride];\
 891         const int src4= src[4*srcStride];\
 892         const int src5= src[5*srcStride];\
 893         const int src6= src[6*srcStride];\
 894         const int src7= src[7*srcStride];\
 895         const int src8= src[8*srcStride];\
 896         const int src9= src[9*srcStride];\
 897         const int src10= src[10*srcStride];\
 898         const int src11= src[11*srcStride];\
 899         const int src12= src[12*srcStride];\
 900         const int src13= src[13*srcStride];\
 901         const int src14= src[14*srcStride];\
 902         const int src15= src[15*srcStride];\
 903         const int src16= src[16*srcStride];\
 904         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 905         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 906         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 907         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 908         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 909         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 910         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 911         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 912         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 913         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 914         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 915         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 916         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 917         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 918         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 919         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 920         dst++;\
 921         src++;\
 922     }\
 923 }\
 924 \
 925 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 926     uint8_t half[64];\
 927     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 928     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 929 }\
 930 \
 931 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 932     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 933 }\
 934 \
 935 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 936     uint8_t half[64];\
 937     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 938     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 939 }\
 940 \
 941 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 942     uint8_t full[16*9];\
 943     uint8_t half[64];\
 944     copy_block9(full, src, 16, stride, 9);\
 945     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 946     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 947 }\
 948 \
 949 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 950     uint8_t full[16*9];\
 951     copy_block9(full, src, 16, stride, 9);\
 952     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 953 }\
 954 \
 955 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 956     uint8_t full[16*9];\
 957     uint8_t half[64];\
 958     copy_block9(full, src, 16, stride, 9);\
 959     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 960     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 961 }\
 962 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 963     uint8_t full[16*9];\
 964     uint8_t halfH[72];\
 965     uint8_t halfV[64];\
 966     uint8_t halfHV[64];\
 967     copy_block9(full, src, 16, stride, 9);\
 968     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 969     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 970     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 971     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 972 }\
 973 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 974     uint8_t full[16*9];\
 975     uint8_t halfH[72];\
 976     uint8_t halfHV[64];\
 977     copy_block9(full, src, 16, stride, 9);\
 978     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 979     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 980     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 981     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 982 }\
 983 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 984     uint8_t full[16*9];\
 985     uint8_t halfH[72];\
 986     uint8_t halfV[64];\
 987     uint8_t halfHV[64];\
 988     copy_block9(full, src, 16, stride, 9);\
 989     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 990     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 991     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 992     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 993 }\
 994 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 995     uint8_t full[16*9];\
 996     uint8_t halfH[72];\
 997     uint8_t halfHV[64];\
 998     copy_block9(full, src, 16, stride, 9);\
 999     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1000     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1001     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1002     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1003 }\
1004 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1005     uint8_t full[16*9];\
1006     uint8_t halfH[72];\
1007     uint8_t halfV[64];\
1008     uint8_t halfHV[64];\
1009     copy_block9(full, src, 16, stride, 9);\
1010     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1011     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1012     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1013     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1014 }\
1015 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1016     uint8_t full[16*9];\
1017     uint8_t halfH[72];\
1018     uint8_t halfHV[64];\
1019     copy_block9(full, src, 16, stride, 9);\
1020     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1021     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1022     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1023     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1024 }\
1025 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1026     uint8_t full[16*9];\
1027     uint8_t halfH[72];\
1028     uint8_t halfV[64];\
1029     uint8_t halfHV[64];\
1030     copy_block9(full, src, 16, stride, 9);\
1031     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1032     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1033     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1034     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1035 }\
1036 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1037     uint8_t full[16*9];\
1038     uint8_t halfH[72];\
1039     uint8_t halfHV[64];\
1040     copy_block9(full, src, 16, stride, 9);\
1041     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1042     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1043     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1044     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1045 }\
1046 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1047     uint8_t halfH[72];\
1048     uint8_t halfHV[64];\
1049     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1050     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1051     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1052 }\
1053 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1054     uint8_t halfH[72];\
1055     uint8_t halfHV[64];\
1056     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1057     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1058     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1059 }\
1060 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1061     uint8_t full[16*9];\
1062     uint8_t halfH[72];\
1063     uint8_t halfV[64];\
1064     uint8_t halfHV[64];\
1065     copy_block9(full, src, 16, stride, 9);\
1066     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1067     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1068     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1069     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1070 }\
1071 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1072     uint8_t full[16*9];\
1073     uint8_t halfH[72];\
1074     copy_block9(full, src, 16, stride, 9);\
1075     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1076     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1077     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1078 }\
1079 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1080     uint8_t full[16*9];\
1081     uint8_t halfH[72];\
1082     uint8_t halfV[64];\
1083     uint8_t halfHV[64];\
1084     copy_block9(full, src, 16, stride, 9);\
1085     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1086     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1087     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1089 }\
1090 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1091     uint8_t full[16*9];\
1092     uint8_t halfH[72];\
1093     copy_block9(full, src, 16, stride, 9);\
1094     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1095     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1096     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1097 }\
1098 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1099     uint8_t halfH[72];\
1100     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1101     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1102 }\
1103 \
1104 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1105     uint8_t half[256];\
1106     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1107     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1108 }\
1109 \
1110 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1111     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1112 }\
1113 \
1114 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1115     uint8_t half[256];\
1116     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1117     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1118 }\
1119 \
1120 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1121     uint8_t full[24*17];\
1122     uint8_t half[256];\
1123     copy_block17(full, src, 24, stride, 17);\
1124     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1125     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1126 }\
1127 \
1128 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1129     uint8_t full[24*17];\
1130     copy_block17(full, src, 24, stride, 17);\
1131     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1132 }\
1133 \
1134 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1135     uint8_t full[24*17];\
1136     uint8_t half[256];\
1137     copy_block17(full, src, 24, stride, 17);\
1138     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1139     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1140 }\
1141 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1142     uint8_t full[24*17];\
1143     uint8_t halfH[272];\
1144     uint8_t halfV[256];\
1145     uint8_t halfHV[256];\
1146     copy_block17(full, src, 24, stride, 17);\
1147     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1148     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1149     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1150     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1151 }\
1152 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1153     uint8_t full[24*17];\
1154     uint8_t halfH[272];\
1155     uint8_t halfHV[256];\
1156     copy_block17(full, src, 24, stride, 17);\
1157     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1158     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1159     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1160     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1161 }\
1162 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1163     uint8_t full[24*17];\
1164     uint8_t halfH[272];\
1165     uint8_t halfV[256];\
1166     uint8_t halfHV[256];\
1167     copy_block17(full, src, 24, stride, 17);\
1168     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1169     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1170     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1171     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1172 }\
1173 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1174     uint8_t full[24*17];\
1175     uint8_t halfH[272];\
1176     uint8_t halfHV[256];\
1177     copy_block17(full, src, 24, stride, 17);\
1178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1179     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1180     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1181     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1182 }\
1183 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1184     uint8_t full[24*17];\
1185     uint8_t halfH[272];\
1186     uint8_t halfV[256];\
1187     uint8_t halfHV[256];\
1188     copy_block17(full, src, 24, stride, 17);\
1189     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1190     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1191     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1193 }\
1194 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1195     uint8_t full[24*17];\
1196     uint8_t halfH[272];\
1197     uint8_t halfHV[256];\
1198     copy_block17(full, src, 24, stride, 17);\
1199     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1200     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1201     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1202     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1203 }\
1204 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1205     uint8_t full[24*17];\
1206     uint8_t halfH[272];\
1207     uint8_t halfV[256];\
1208     uint8_t halfHV[256];\
1209     copy_block17(full, src, 24, stride, 17);\
1210     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1211     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1212     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1213     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1214 }\
1215 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1216     uint8_t full[24*17];\
1217     uint8_t halfH[272];\
1218     uint8_t halfHV[256];\
1219     copy_block17(full, src, 24, stride, 17);\
1220     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1221     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1222     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1223     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1224 }\
1225 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1226     uint8_t halfH[272];\
1227     uint8_t halfHV[256];\
1228     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1229     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1230     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1231 }\
1232 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1233     uint8_t halfH[272];\
1234     uint8_t halfHV[256];\
1235     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1236     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1237     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1238 }\
1239 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1240     uint8_t full[24*17];\
1241     uint8_t halfH[272];\
1242     uint8_t halfV[256];\
1243     uint8_t halfHV[256];\
1244     copy_block17(full, src, 24, stride, 17);\
1245     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1246     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1247     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1248     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1249 }\
1250 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1251     uint8_t full[24*17];\
1252     uint8_t halfH[272];\
1253     copy_block17(full, src, 24, stride, 17);\
1254     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1255     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1256     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1257 }\
1258 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1259     uint8_t full[24*17];\
1260     uint8_t halfH[272];\
1261     uint8_t halfV[256];\
1262     uint8_t halfHV[256];\
1263     copy_block17(full, src, 24, stride, 17);\
1264     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1265     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1266     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1267     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1268 }\
1269 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1270     uint8_t full[24*17];\
1271     uint8_t halfH[272];\
1272     copy_block17(full, src, 24, stride, 17);\
1273     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1274     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1275     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1276 }\
1277 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1278     uint8_t halfH[272];\
1279     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1280     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1281 }
1282
1283 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1284 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1285 #define op_put(a, b) a = cm[((b) + 16)>>5]
1286 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1287
1288 QPEL_MC(0, put_       , _       , op_put)
1289 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1290 QPEL_MC(0, avg_       , _       , op_avg)
1291 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1292 #undef op_avg
1293 #undef op_avg_no_rnd
1294 #undef op_put
1295 #undef op_put_no_rnd
1296
/*
 * The mc00 entries are not generated by QPEL_MC here; they are aliased to
 * pixel routines defined elsewhere (presumably plain block copy/average).
 * NOTE(review): the 16x16 no_rnd alias names the `_8_c` symbol while every
 * other alias uses an unsuffixed name; confirm this is intentional.
 */
/* 8x8 */
#define put_qpel8_mc00_c          ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c   ff_put_pixels8x8_c
#define avg_qpel8_mc00_c          ff_avg_pixels8x8_c
/* 16x16 */
#define put_qpel16_mc00_c         ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c  ff_put_pixels16x16_8_c
#define avg_qpel16_mc00_c         ff_avg_pixels16x16_c
1303
1304 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1305     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1306     int i;
1307
1308     for(i=0; i<h; i++){
1309         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1310         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1311         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1312         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1313         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1314         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1315         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1316         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1317         dst+=dstStride;
1318         src+=srcStride;
1319     }
1320 }
1321
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points. Each one forwards its arguments unchanged to the
 * corresponding 8-bit xy2 pixel routine (put or avg, 16 or 8 wide), passing
 * the block size (16 or 8) as the final argument.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1336
#if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(PFX) emits the ff_<PFX>_dirac_pixels{8,16,32}{,_l2,_l4}_c wrappers.
 * Every wrapper receives an array of source pointers and forwards the first
 * one, two or four of them to the matching <PFX>_pixels* routine, passing
 * `stride` for each stride argument. The 32-wide wrappers are assembled from
 * two 16-wide calls, the second one offset by 16 bytes in dst and in every
 * source.
 */
#define DIRAC_MC(PFX) \
void ff_ ## PFX ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels8_8_c(dst, src[0], stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels16_8_c(dst, src[0], stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels16_8_c(dst,      src[0],      stride, h); \
    PFX ## _pixels16_8_c(dst + 16, src[0] + 16, stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels16_l2_8(dst,      src[0],      src[1],      \
                          stride, stride, stride, h); \
    PFX ## _pixels16_l2_8(dst + 16, src[0] + 16, src[1] + 16, \
                          stride, stride, stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], \
                         stride, stride, stride, stride, stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], \
                          stride, stride, stride, stride, stride, h); \
} \
 \
void ff_ ## PFX ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
{ \
    PFX ## _pixels16_l4_8(dst,      src[0],      src[1],      src[2],      src[3],      \
                          stride, stride, stride, stride, stride, h); \
    PFX ## _pixels16_l4_8(dst + 16, src[0] + 16, src[1] + 16, src[2] + 16, src[3] + 16, \
                          stride, stride, stride, stride, stride, h); \
}

DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1381
1382 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1383     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1384     int i;
1385
1386     for(i=0; i<w; i++){
1387         const int src_1= src[ -srcStride];
1388         const int src0 = src[0          ];
1389         const int src1 = src[  srcStride];
1390         const int src2 = src[2*srcStride];
1391         const int src3 = src[3*srcStride];
1392         const int src4 = src[4*srcStride];
1393         const int src5 = src[5*srcStride];
1394         const int src6 = src[6*srcStride];
1395         const int src7 = src[7*srcStride];
1396         const int src8 = src[8*srcStride];
1397         const int src9 = src[9*srcStride];
1398         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1399         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1400         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1401         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1402         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1403         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1404         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1405         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1406         src++;
1407         dst++;
1408     }
1409 }
1410
/**
 * mspel 8x8, mc10: horizontally filter the block into a scratch buffer, then
 * combine it with the unshifted source through put_pixels8_l2_8 (presumably
 * a two-source average).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[64];   /* 8 rows of 8 bytes */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1416
/**
 * mspel 8x8, mc20: the horizontally filtered block is written straight to
 * dst; source and destination share the same stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1420
/**
 * mspel 8x8, mc30: like mc10, but the filtered block is combined with the
 * source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[64];   /* 8 rows of 8 bytes */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1426
/**
 * mspel 8x8, mc02: the vertically filtered block (8 columns) is written
 * straight to dst; source and destination share the same stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1430
/**
 * mspel 8x8, mc12: combine the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2_8.
 *
 * The horizontal pass covers 11 rows starting one row above src, so that the
 * following vertical pass, which starts at the second row of that buffer,
 * finds all the rows it reads (-1 .. 9) inside the buffer.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[88];     /* 11 rows of 8 bytes */
    uint8_t v_only[64];
    uint8_t hv[64];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8, mc32: same as mc12 except that the vertical-only pass runs on
 * the source shifted one pixel to the right (src+1).
 *
 * The horizontal pass covers 11 rows starting one row above src, so that the
 * following vertical pass, which starts at the second row of that buffer,
 * finds all the rows it reads (-1 .. 9) inside the buffer.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[88];     /* 11 rows of 8 bytes */
    uint8_t v_only[64];
    uint8_t hv[64];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8, mc22: horizontal filter into an 11-row scratch buffer (starting
 * one row above src), then vertical filter from its second row directly
 * into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[88];     /* 11 rows of 8 bytes */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1454
1455 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1456     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1457     int x;
1458     const int strength= ff_h263_loop_filter_strength[qscale];
1459
1460     for(x=0; x<8; x++){
1461         int d1, d2, ad1;
1462         int p0= src[x-2*stride];
1463         int p1= src[x-1*stride];
1464         int p2= src[x+0*stride];
1465         int p3= src[x+1*stride];
1466         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1467
1468         if     (d<-2*strength) d1= 0;
1469         else if(d<-  strength) d1=-2*strength - d;
1470         else if(d<   strength) d1= d;
1471         else if(d< 2*strength) d1= 2*strength - d;
1472         else                   d1= 0;
1473
1474         p1 += d1;
1475         p2 -= d1;
1476         if(p1&256) p1= ~(p1>>31);
1477         if(p2&256) p2= ~(p2>>31);
1478
1479         src[x-1*stride] = p1;
1480         src[x+0*stride] = p2;
1481
1482         ad1= FFABS(d1)>>1;
1483
1484         d2= av_clip((p0-p3)/4, -ad1, ad1);
1485
1486         src[x-2*stride] = p0 - d2;
1487         src[x+  stride] = p3 + d2;
1488     }
1489     }
1490 }
1491
1492 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1493     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1494     int y;
1495     const int strength= ff_h263_loop_filter_strength[qscale];
1496
1497     for(y=0; y<8; y++){
1498         int d1, d2, ad1;
1499         int p0= src[y*stride-2];
1500         int p1= src[y*stride-1];
1501         int p2= src[y*stride+0];
1502         int p3= src[y*stride+1];
1503         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1504
1505         if     (d<-2*strength) d1= 0;
1506         else if(d<-  strength) d1=-2*strength - d;
1507         else if(d<   strength) d1= d;
1508         else if(d< 2*strength) d1= 2*strength - d;
1509         else                   d1= 0;
1510
1511         p1 += d1;
1512         p2 -= d1;
1513         if(p1&256) p1= ~(p1>>31);
1514         if(p2&256) p2= ~(p2>>31);
1515
1516         src[y*stride-1] = p1;
1517         src[y*stride+0] = p2;
1518
1519         ad1= FFABS(d1)>>1;
1520
1521         d2= av_clip((p0-p3)/4, -ad1, ad1);
1522
1523         src[y*stride-2] = p0 - d2;
1524         src[y*stride+1] = p3 + d2;
1525     }
1526     }
1527 }
1528
/**
 * In-place separable 1-2-1 smoothing of an 8x8 block.
 *
 * Vertical pass: rows 1..6 become above + 2*cur + below, rows 0 and 7 are
 * simply scaled by 4, so every intermediate value carries a factor of 4.
 * Horizontal pass: columns 1..6 apply 1-2-1 again and divide by 16 with
 * rounding, columns 0 and 7 only undo the factor of 4 with rounding.
 *
 * @param src    top-left sample of the block, overwritten with the result
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[64];   /* vertically filtered block, 8 values per row */
    int row, col;

    /* Vertical pass: reads src only, writes vert only. */
    for (row = 0; row < 8; row++) {
        const uint8_t *in  = src  + row * stride;
        int           *out = vert + row * 8;

        if (row == 0 || row == 7) {
            for (col = 0; col < 8; col++)
                out[col] = 4 * in[col];
        } else {
            for (col = 0; col < 8; col++)
                out[col] = in[col - stride] + 2 * in[col] + in[col + stride];
        }
    }

    /* Horizontal pass: reads vert only, writes back to src. */
    for (row = 0; row < 8; row++) {
        const int *in  = vert + row * 8;
        uint8_t   *out = src  + row * stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2 * in[col] + in[col + 1] + 8) >> 4;
    }
}
1555
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1583
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * reference pixel with its right-hand neighbour.
 * Reads 17 pixels per row from pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1611
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * reference pixel with the pixel directly below it.
 * Reads h+1 rows from pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1641
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of each
 * 2x2 neighbourhood of the reference (pixel, right, below, below-right).
 * Reads 17 pixels per row and h+1 rows from pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1671
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1691
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of each
 * reference pixel with its right-hand neighbour.
 * Reads 9 pixels per row from pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1711
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of each
 * reference pixel with the pixel directly below it.
 * Reads h+1 rows from pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1733
/**
 * Sum of absolute differences between an 8-wide block and the avg4() of each
 * 2x2 neighbourhood of the reference (pixel, right, below, below-right).
 * Reads 9 pixels per row and h+1 rows from pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1755
1756 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1757     MpegEncContext *c = v;
1758     int score1=0;
1759     int score2=0;
1760     int x,y;
1761
1762     for(y=0; y<h; y++){
1763         for(x=0; x<16; x++){
1764             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1765         }
1766         if(y+1<h){
1767             for(x=0; x<15; x++){
1768                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1769                              - s1[x+1] + s1[x+1+stride])
1770                         -FFABS(  s2[x  ] - s2[x  +stride]
1771                              - s2[x+1] + s2[x+1+stride]);
1772             }
1773         }
1774         s1+= stride;
1775         s2+= stride;
1776     }
1777
1778     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1779     else  return score1 + FFABS(score2)*8;
1780 }
1781
1782 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1783     MpegEncContext *c = v;
1784     int score1=0;
1785     int score2=0;
1786     int x,y;
1787
1788     for(y=0; y<h; y++){
1789         for(x=0; x<8; x++){
1790             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1791         }
1792         if(y+1<h){
1793             for(x=0; x<7; x++){
1794                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1795                              - s1[x+1] + s1[x+1+stride])
1796                         -FFABS(  s2[x  ] - s2[x  +stride]
1797                              - s2[x+1] + s2[x+1+stride]);
1798             }
1799         }
1800         s1+= stride;
1801         s2+= stride;
1802     }
1803
1804     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1805     else  return score1 + FFABS(score2)*8;
1806 }
1807
1808 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1809     int i;
1810     unsigned int sum=0;
1811
1812     for(i=0; i<8*8; i++){
1813         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1814         int w= weight[i];
1815         b>>= RECON_SHIFT;
1816         assert(-512<b && b<512);
1817
1818         sum += (w*b)*(w*b)>>4;
1819     }
1820     return sum>>2;
1821 }
1822
1823 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1824     int i;
1825
1826     for(i=0; i<8*8; i++){
1827         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1828     }
1829 }
1830
1831 /**
1832  * Permute an 8x8 block.
1833  * @param block the block which will be permuted according to the given permutation vector
1834  * @param permutation the permutation vector
1835  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1836  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1837  *                  (inverse) permutated to scantable order!
1838  */
1839 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1840 {
1841     int i;
1842     DCTELEM temp[64];
1843
1844     if(last<=0) return;
1845     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1846
1847     for(i=0; i<=last; i++){
1848         const int j= scantable[i];
1849         temp[j]= block[j];
1850         block[j]=0;
1851     }
1852
1853     for(i=0; i<=last; i++){
1854         const int j= scantable[i];
1855         const int perm_j= permutation[j];
1856         block[perm_j]= temp[j];
1857     }
1858 }
1859
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1863
1864 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1865     int i;
1866
1867     memset(cmp, 0, sizeof(void*)*6);
1868
1869     for(i=0; i<6; i++){
1870         switch(type&0xFF){
1871         case FF_CMP_SAD:
1872             cmp[i]= c->sad[i];
1873             break;
1874         case FF_CMP_SATD:
1875             cmp[i]= c->hadamard8_diff[i];
1876             break;
1877         case FF_CMP_SSE:
1878             cmp[i]= c->sse[i];
1879             break;
1880         case FF_CMP_DCT:
1881             cmp[i]= c->dct_sad[i];
1882             break;
1883         case FF_CMP_DCT264:
1884             cmp[i]= c->dct264_sad[i];
1885             break;
1886         case FF_CMP_DCTMAX:
1887             cmp[i]= c->dct_max[i];
1888             break;
1889         case FF_CMP_PSNR:
1890             cmp[i]= c->quant_psnr[i];
1891             break;
1892         case FF_CMP_BIT:
1893             cmp[i]= c->bit[i];
1894             break;
1895         case FF_CMP_RD:
1896             cmp[i]= c->rd[i];
1897             break;
1898         case FF_CMP_VSAD:
1899             cmp[i]= c->vsad[i];
1900             break;
1901         case FF_CMP_VSSE:
1902             cmp[i]= c->vsse[i];
1903             break;
1904         case FF_CMP_ZERO:
1905             cmp[i]= zero_cmp;
1906             break;
1907         case FF_CMP_NSSE:
1908             cmp[i]= c->nsse[i];
1909             break;
1910 #if CONFIG_DWT
1911         case FF_CMP_W53:
1912             cmp[i]= c->w53[i];
1913             break;
1914         case FF_CMP_W97:
1915             cmp[i]= c->w97[i];
1916             break;
1917 #endif
1918         default:
1919             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1920         }
1921     }
1922 }
1923
1924 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1925     long i;
1926     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1927         long a = *(long*)(src+i);
1928         long b = *(long*)(dst+i);
1929         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1930     }
1931     for(; i<w; i++)
1932         dst[i+0] += src[i+0];
1933 }
1934
1935 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1936     long i;
1937 #if !HAVE_FAST_UNALIGNED
1938     if((long)src2 & (sizeof(long)-1)){
1939         for(i=0; i+7<w; i+=8){
1940             dst[i+0] = src1[i+0]-src2[i+0];
1941             dst[i+1] = src1[i+1]-src2[i+1];
1942             dst[i+2] = src1[i+2]-src2[i+2];
1943             dst[i+3] = src1[i+3]-src2[i+3];
1944             dst[i+4] = src1[i+4]-src2[i+4];
1945             dst[i+5] = src1[i+5]-src2[i+5];
1946             dst[i+6] = src1[i+6]-src2[i+6];
1947             dst[i+7] = src1[i+7]-src2[i+7];
1948         }
1949     }else
1950 #endif
1951     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1952         long a = *(long*)(src1+i);
1953         long b = *(long*)(src2+i);
1954         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1955     }
1956     for(; i<w; i++)
1957         dst[i+0] = src1[i+0]-src2[i+0];
1958 }
1959
/**
 * Reconstruct a row from residuals using a median predictor.
 * Each output is mid_pred(left, top, left + top - topleft) + diff[i], where
 * the gradient term is masked to 8 bits and the result wraps to 8 bits.
 *
 * @param dst      reconstructed row
 * @param src1     the row used as "top" neighbour
 * @param diff     residuals to add
 * @param w        number of samples
 * @param left     in: left neighbour of the first sample; out: last output
 * @param left_top in: top-left neighbour of the first sample; out: last top
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    /* Both running values are deliberately 8 bits wide so that they wrap. */
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(cur, src1[i], (cur + src1[i] - top_left) & 0xFF);

        cur      = pred + diff[i];
        top_left = src1[i];
        dst[i]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
1976
/**
 * Compute residuals of a row against a median predictor.
 * Each output is src2[i] - mid_pred(left, top, left + top - topleft), where
 * the gradient term is masked to 8 bits and "left" is the previous src2
 * sample.
 *
 * @param dst      residual row
 * @param src1     the row used as "top" neighbour
 * @param src2     the row being predicted
 * @param w        number of samples
 * @param left     in: left neighbour of the first sample; out: last src2 sample
 * @param left_top in: top-left neighbour of the first sample; out: last top
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    /* Both running values are deliberately 8 bits wide so that they wrap. */
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(cur, src1[i], (cur + src1[i] - top_left) & 0xFF);

        top_left = src1[i];
        cur      = src2[i];
        dst[i]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}
1994
/**
 * Running-sum reconstruction: every output is the low 8 bits of the
 * accumulator after adding the corresponding source byte.
 *
 * @param dst destination row
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the final accumulator (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int pos = 0;

    while (pos < w) {
        acc     += src[pos];
        dst[pos] = acc;
        pos++;
    }

    return acc;
}
2013
/* Byte offsets of the four channels inside a 32-bit pixel, selected by
 * the host endianness. They are local to the function below. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running-sum reconstruction of 4-byte pixels.
 * Every output byte is the low 8 bits of its channel accumulator.
 *
 * @param dst   destination, 4 bytes per pixel
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in/out accumulator for the R channel
 * @param green in/out accumulator for the G channel
 * @param blue  in/out accumulator for the B channel
 * @param alpha in/out accumulator for the A channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2054
/* Butterfly with separate outputs: o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two plain statements (no enclosing block), so it must not be
 * used as the unbraced body of an if/for. The inputs are parenthesized,
 * the outputs are not. */
2055 #define BUTTERFLY2(o1,o2,i1,i2) \
2056 o1= (i1)+(i2);\
2057 o2= (i1)-(i2);
2058
/* In-place butterfly: (x, y) becomes (x + y, x - y).
 * Both values are read into the block-local temporaries a and b before
 * either is written; arguments that themselves mention identifiers named
 * a or b would be captured by those temporaries. */
2059 #define BUTTERFLY1(x,y) \
2060 {\
2061     int a,b;\
2062     a= x;\
2063     b= y;\
2064     x= a+b;\
2065     y= a-b;\
2066 }
2067
/* Final butterfly stage folded into the accumulation: yields
 * |x + y| + |x - y| as an expression without storing anything.
 * x and y are each evaluated more than once. */
2068 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2069
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-type) transform of
 * the pixel difference src - dst.
 *
 * @param s      unused (an MpegEncContext by convention)
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* Horizontal pass: three butterfly stages over every row of src - dst. */
    for (i = 0; i < 8; i++) {
        const uint8_t *src_row = src + stride * i;
        const uint8_t *dst_row = dst + stride * i;
        int *t = temp + 8 * i;

        BUTTERFLY2(t[0], t[1], src_row[0]-dst_row[0], src_row[1]-dst_row[1]);
        BUTTERFLY2(t[2], t[3], src_row[2]-dst_row[2], src_row[3]-dst_row[3]);
        BUTTERFLY2(t[4], t[5], src_row[4]-dst_row[4], src_row[5]-dst_row[5]);
        BUTTERFLY2(t[6], t[7], src_row[6]-dst_row[6], src_row[7]-dst_row[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* Vertical pass: two stored stages, the third is folded into the
     * absolute-value accumulation by BUTTERFLYA. */
    for (i = 0; i < 8; i++) {
        int *t = temp + i;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
             + BUTTERFLYA(t[8*1], t[8*5])
             + BUTTERFLYA(t[8*2], t[8*6])
             + BUTTERFLYA(t[8*3], t[8*7]);
    }
    return sum;
}
2114
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-type) transform of
 * a single block, with the contribution of the mean (DC term) removed.
 *
 * @param s      unused (an MpegEncContext by convention)
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* Horizontal pass: three butterfly stages over every row. */
    for (i = 0; i < 8; i++) {
        const uint8_t *src_row = src + stride * i;
        int *t = temp + 8 * i;

        BUTTERFLY2(t[0], t[1], src_row[0], src_row[1]);
        BUTTERFLY2(t[2], t[3], src_row[2], src_row[3]);
        BUTTERFLY2(t[4], t[5], src_row[4], src_row[5]);
        BUTTERFLY2(t[6], t[7], src_row[6], src_row[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* Vertical pass: two stored stages, the third is folded into the
     * absolute-value accumulation by BUTTERFLYA. */
    for (i = 0; i < 8; i++) {
        int *t = temp + i;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
             + BUTTERFLYA(t[8*1], t[8*5])
             + BUTTERFLYA(t[8*2], t[8*6])
             + BUTTERFLYA(t[8*3], t[8*7]);
    }

    /* Take the DC coefficient back out of the sum (-mean). */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
2162
2163 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2164     MpegEncContext * const s= (MpegEncContext *)c;
2165     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2166
2167     assert(h==8);
2168
2169     s->dsp.diff_pixels(temp, src1, src2, stride);
2170     s->dsp.fdct(temp);
2171     return s->dsp.sum_abs_dctelem(temp);
2172 }
2173
2174 #if CONFIG_GPL
/* One 8-point 1-D integer transform built from adds, subtracts and shifts.
 * The caller must define SRC(n) to read input n and DST(n, v) to consume
 * output n before expanding this macro. All eight inputs are read into
 * locals before the first DST, so DST may store over the locations that
 * SRC reads (in-place use). Expands to a braced block, so it can serve
 * directly as a loop body. */
2175 #define DCT8_1D {\
2176     const int s07 = SRC(0) + SRC(7);\
2177     const int s16 = SRC(1) + SRC(6);\
2178     const int s25 = SRC(2) + SRC(5);\
2179     const int s34 = SRC(3) + SRC(4);\
2180     const int a0 = s07 + s34;\
2181     const int a1 = s16 + s25;\
2182     const int a2 = s07 - s34;\
2183     const int a3 = s16 - s25;\
2184     const int d07 = SRC(0) - SRC(7);\
2185     const int d16 = SRC(1) - SRC(6);\
2186     const int d25 = SRC(2) - SRC(5);\
2187     const int d34 = SRC(3) - SRC(4);\
2188     const int a4 = d16 + d25 + (d07 + (d07>>1));\
2189     const int a5 = d07 - d34 - (d25 + (d25>>1));\
2190     const int a6 = d07 + d34 - (d16 + (d16>>1));\
2191     const int a7 = d16 - d25 + (d34 + (d34>>1));\
2192     DST(0,  a0 + a1     ) ;\
2193     DST(1,  a4 + (a7>>2)) ;\
2194     DST(2,  a2 + (a3>>1)) ;\
2195     DST(3,  a5 + (a6>>2)) ;\
2196     DST(4,  a0 - a1     ) ;\
2197     DST(5,  a6 - (a5>>2)) ;\
2198     DST(6, (a2>>1) - a3 ) ;\
2199     DST(7, (a4>>2) - a7 ) ;\
2200 }
2201
/**
 * Sum of absolute coefficients of the DCT8_1D-based 2-D transform of the
 * difference between two 8x8 blocks (presumably the H.264-style 8x8
 * integer transform, going by the name -- confirm).
 *
 * @param c      MpegEncContext; only its dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between rows
 * @param h      not used by this function
 */
2202 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2203     MpegEncContext * const s= (MpegEncContext *)c;
2204     DCTELEM dct[8][8];
2205     int i;
2206     int sum=0;
2207
2208     s->dsp.diff_pixels(dct[0], src1, src2, stride);
2209
/* First pass: transform each row i of dct[][] in place. */
2210 #define SRC(x) dct[i][x]
2211 #define DST(x,v) dct[i][x]= v
2212     for( i = 0; i < 8; i++ )
2213         DCT8_1D
2214 #undef SRC
2215 #undef DST
2216
/* Second pass: transform each column i; the outputs are not stored, their
 * absolute values are accumulated straight into sum. */
2217 #define SRC(x) dct[x][i]
2218 #define DST(x,v) sum += FFABS(v)
2219     for( i = 0; i < 8; i++ )
2220         DCT8_1D
2221 #undef SRC
2222 #undef DST
2223     return sum;
2224 }
2225 #endif
2226
2227 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2228     MpegEncContext * const s= (MpegEncContext *)c;
2229     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2230     int sum=0, i;
2231
2232     assert(h==8);
2233
2234     s->dsp.diff_pixels(temp, src1, src2, stride);
2235     s->dsp.fdct(temp);
2236
2237     for(i=0; i<64; i++)
2238         sum= FFMAX(sum, FFABS(temp[i]));
2239
2240     return sum;
2241 }
2242
2243 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2244     MpegEncContext * const s= (MpegEncContext *)c;
2245     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2246     DCTELEM * const bak = temp+64;
2247     int sum=0, i;
2248
2249     assert(h==8);
2250     s->mb_intra=0;
2251
2252     s->dsp.diff_pixels(temp, src1, src2, stride);
2253
2254     memcpy(bak, temp, 64*sizeof(DCTELEM));
2255
2256     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2257     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2258     ff_simple_idct_8(temp); //FIXME
2259
2260     for(i=0; i<64; i++)
2261         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2262
2263     return sum;
2264 }
2265
2266 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2267     MpegEncContext * const s= (MpegEncContext *)c;
2268     const uint8_t *scantable= s->intra_scantable.permutated;
2269     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2270     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2271     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2272     int i, last, run, bits, level, distortion, start_i;
2273     const int esc_length= s->ac_esc_length;
2274     uint8_t * length;
2275     uint8_t * last_length;
2276
2277     assert(h==8);
2278
2279     copy_block8(lsrc1, src1, 8, stride, 8);
2280     copy_block8(lsrc2, src2, 8, stride, 8);
2281
2282     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2283
2284     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2285
2286     bits=0;
2287
2288     if (s->mb_intra) {
2289         start_i = 1;
2290         length     = s->intra_ac_vlc_length;
2291         last_length= s->intra_ac_vlc_last_length;
2292         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2293     } else {
2294         start_i = 0;
2295         length     = s->inter_ac_vlc_length;
2296         last_length= s->inter_ac_vlc_last_length;
2297     }
2298
2299     if(last>=start_i){
2300         run=0;
2301         for(i=start_i; i<last; i++){
2302             int j= scantable[i];
2303             level= temp[j];
2304
2305             if(level){
2306                 level+=64;
2307                 if((level&(~127)) == 0){
2308                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2309                 }else
2310                     bits+= esc_length;
2311                 run=0;
2312             }else
2313                 run++;
2314         }
2315         i= scantable[last];
2316
2317         level= temp[i] + 64;
2318
2319         assert(level - 64);
2320
2321         if((level&(~127)) == 0){
2322             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2323         }else
2324             bits+= esc_length;
2325
2326     }
2327
2328     if(last>=0){
2329         if(s->mb_intra)
2330             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2331         else
2332             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2333     }
2334
2335     s->dsp.idct_add(lsrc2, 8, temp);
2336
2337     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2338
2339     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2340 }
2341
2342 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2343     MpegEncContext * const s= (MpegEncContext *)c;
2344     const uint8_t *scantable= s->intra_scantable.permutated;
2345     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2346     int i, last, run, bits, level, start_i;
2347     const int esc_length= s->ac_esc_length;
2348     uint8_t * length;
2349     uint8_t * last_length;
2350
2351     assert(h==8);
2352
2353     s->dsp.diff_pixels(temp, src1, src2, stride);
2354
2355     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2356
2357     bits=0;
2358
2359     if (s->mb_intra) {
2360         start_i = 1;
2361         length     = s->intra_ac_vlc_length;
2362         last_length= s->intra_ac_vlc_last_length;
2363         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2364     } else {
2365         start_i = 0;
2366         length     = s->inter_ac_vlc_length;
2367         last_length= s->inter_ac_vlc_last_length;
2368     }
2369
2370     if(last>=start_i){
2371         run=0;
2372         for(i=start_i; i<last; i++){
2373             int j= scantable[i];
2374             level= temp[j];
2375
2376             if(level){
2377                 level+=64;
2378                 if((level&(~127)) == 0){
2379                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2380                 }else
2381                     bits+= esc_length;
2382                 run=0;
2383             }else
2384                 run++;
2385         }
2386         i= scantable[last];
2387
2388         level= temp[i] + 64;
2389
2390         assert(level - 64);
2391
2392         if((level&(~127)) == 0){
2393             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2394         }else
2395             bits+= esc_length;
2396     }
2397
2398     return bits;
2399 }
2400
/* Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over the first <size>
 * columns and rows 0..h-2. The c and dummy arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0;                                          \
    int row, col;                                           \
                                                            \
    for (row = 1; row < h; row++) {                         \
        const uint8_t *below = s + stride;                  \
                                                            \
        for (col = 0; col < size; col++)                    \
            score += FFABS(s[col] - below[col]);            \
        s += stride;                                        \
    }                                                       \
                                                            \
    return score;                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2418
/**
 * Sum of absolute vertical differences of the error signal s1 - s2 over a
 * 16-wide block: each term compares the error at a pixel with the error
 * directly below it.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *next1 = s1 + stride;
        uint8_t *next2 = s2 + stride;

        for (col = 0; col < 16; col++)
            score += FFABS(s1[col] - s2[col] - next1[col] + next2[col]);

        s1 = next1;
        s2 = next2;
    }

    return score;
}
2433
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel directly below it, over the first <size>
 * columns and rows 0..h-2. The c and dummy arguments are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0;                                          \
    int row, col;                                           \
                                                            \
    for (row = 1; row < h; row++) {                         \
        const uint8_t *below = s + stride;                  \
                                                            \
        for (col = 0; col < size; col++)                    \
            score += SQ(s[col] - below[col]);               \
        s += stride;                                        \
    }                                                       \
                                                            \
    return score;                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2452
/**
 * Vertical SSE of the residual between two 16-pixel-wide blocks.
 *
 * Same row-pair walk as vsad16_c, but the difference between the residual
 * (s1 - s2) of a row and that of the row below is squared before it is
 * accumulated.
 *
 * @param c      context pointer, not used here
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      number of rows; h - 1 row pairs are evaluated
 * @return the accumulated squared difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        const uint8_t *next1 = s1 + stride;
        const uint8_t *next2 = s2 + stride;
        int col;

        for (col = 0; col < 16; col++)
            sum += SQ(s1[col] - s2[col] - next1[col] + next2[col]);
    }

    return sum;
}
2467
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 *
 * @param pix1 first operand, size elements
 * @param pix2 second operand, size elements
 * @param size number of elements to compare
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int diff = pix1[k] - pix2[k];
        sum += diff * diff;
    }
    return sum;
}
2476
/*
 * Instantiate the *16_c comparison functions (second argument) from their
 * 8x8 counterparts (first argument).
 * NOTE(review): WRAPPER8_16_SQ is defined outside this section; presumably it
 * applies the 8x8 function to each 8x8 part of a 16-wide block and adds the
 * results -- confirm against the macro definition.
 * The dct264 variant is only built when CONFIG_GPL is set.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2487
/**
 * Multiply src0 element-wise by src1 read back to front:
 * dst[n] = src0[n] * src1[len - 1 - n].
 *
 * @param dst  output, len elements
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
2494
/**
 * Element-wise multiply-add: dst[n] = src0[n] * src1[n] + src2[n].
 *
 * @param dst  output, len elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int left;

    for (left = len; left > 0; left--)
        *dst++ = *src0++ * *src1++ + *src2++;
}
2500
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 *
 * For k in [0, len), with m = 2*len - 1 - k as the mirrored position:
 *   dst[k] = src0[k] * win[m] - src1[len - 1 - k] * win[k]
 *   dst[m] = src0[k] * win[k] + src1[len - 1 - k] * win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements, read forwards
 * @param src1 second input, len elements, read backwards
 * @param win  window, 2*len elements
 * @param len  number of elements in each input
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int mirror = 2 * len - 1 - k;
        const float a    = src0[k];
        const float b    = src1[len - 1 - k];
        const float w_lo = win[k];
        const float w_hi = win[mirror];

        dst[k]      = a * w_hi - b * w_lo;
        dst[mirror] = a * w_lo + b * w_hi;
    }
}
2517
/**
 * Scale a vector by a constant: dst[n] = src[n] * mul.
 *
 * @param dst output, len elements
 * @param src input, len elements
 * @param mul scale factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int left;

    for (left = len; left > 0; left--)
        *dst++ = *src++ * mul;
}
2525
2526 static void butterflies_float_c(float *av_restrict v1, float *av_restrict v2,
2527                                 int len)
2528 {
2529     int i;
2530     for (i = 0; i < len; i++) {
2531         float t = v1[i] - v2[i];
2532         v1[i] += v2[i];
2533         v2[i] = t;
2534     }
2535 }
2536
/**
 * Butterfly with interleaved output: for every n, the sum goes to
 * dst[2*n] and the difference (src0 - src1) to dst[2*n + 1].
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements
 * @param len  number of input elements
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float a = src0[n];
        const float b = src1[n];
        float *pair   = dst + 2 * n;

        pair[0] = a + b;
        pair[1] = a - b;
    }
}
2548
/**
 * Dot product of two float vectors, accumulated in single precision in
 * index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[n] * v2[n]
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n = 0;

    while (n < len) {
        acc += v1[n] * v2[n];
        n++;
    }

    return acc;
}
2559
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * vector_clipf_c_opposite_sign passes the raw 32-bit patterns of floats here,
 * with a negative lower bound and a positive upper bound.
 *
 * @param a        value to clip
 * @param mini     lower bound; returned when a compares greater than it as
 *                 an unsigned integer
 * @param maxi     upper bound; returned when a with its top bit flipped
 *                 compares greater than maxisign
 * @param maxisign maxi with its top bit flipped
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2568
/**
 * Clip a float vector by running clipf_c_one on the raw 32-bit patterns of
 * the elements and of the two bounds.
 *
 * vector_clipf_c only takes this path when *min < 0 and *max > 0.
 * Elements are handled in groups of eight, so a len that is not a multiple
 * of 8 is rounded up to the next full group.
 * NOTE(review): the floats are read through uint32_t pointers, which relies
 * on the compiler tolerating this kind of type punning.
 *
 * @param dst output
 * @param src input
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits = *(uint32_t*)min;
    const uint32_t hi_bits = *(uint32_t*)max;
    const uint32_t hi_flip = hi_bits ^ (1U<<31);
    uint32_t *out          = (uint32_t*)dst;
    const uint32_t *in     = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When the bounds straddle zero (min < 0 and max > 0) the work is handed to
 * vector_clipf_c_opposite_sign; otherwise av_clipf is applied per element.
 * Both paths process eight elements per step, so a len that is not a
 * multiple of 8 is rounded up to the next full group.
 *
 * @param dst output
 * @param src input
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2604
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements; the loop counts it down to zero
 * @return sum of v1[n] * v2[n]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;

    for (; order; order--, v1++, v2++)
        acc += *v1 * *v2;

    return acc;
}
2614
/**
 * Dot product of v1 and v2, combined with an in-place update of v1.
 *
 * Each element of v1 contributes its old value to the dot product and is
 * then replaced by v1[n] + mul * v3[n], truncated to int16_t on store.
 *
 * @param v1    first vector, updated in place
 * @param v2    second factor of the dot product
 * @param v3    vector added to v1 after scaling by mul
 * @param order number of elements; the loop counts it down to zero
 * @param mul   scale applied to v3
 * @return sum of (old v1[n]) * v2[n]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order; order--, v1++, v2++, v3++) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
    }
    return acc;
}
2624
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient n scales
 * both sample n and its mirror sample len - 1 - n. Each product is rounded
 * by adding 1 << 14 and shifted right by 15.
 *
 * @param output windowed samples, len elements
 * @param input  samples to window, len elements
 * @param window window coefficients, len/2 elements are read
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int head;

    for (head = 0; head < half; head++) {
        const unsigned int tail = len - head - 1;
        const int16_t coef      = window[head];

        output[head] = (MUL16(input[head], coef) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], coef) + (1 << 14)) >> 15;
    }
}
2637
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * Eight elements are handled per pass and the first pass runs before len is
 * checked. len is unsigned and decremented by 8 each pass, so it has to be a
 * non-zero multiple of 8 for the loop to terminate at the intended element.
 *
 * @param dst output
 * @param src input
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);
        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2653
/* Fixed-point IDCT weights: Wk = 2048*sqrt(2)*cos(k*pi/16), W0 is plain 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional WMV2 inverse DCT over a row of eight coefficients,
 * computed in place. Results are rounded (+128) and shifted right by 8.
 *
 * @param b the eight coefficients of the row, overwritten with the result
 */
static void wmv2_idct_row(short * b)
{
    int even0, even4, even2, even6;
    int odd1, odd7, odd5, odd3;
    int rot_a, rot_b;

    /* step 1: weighted sums of the coefficient pairs */
    even0 = W0*b[0]+W0*b[4];
    even4 = W0*b[0]-W0*b[4];
    even2 = W2*b[2]+W6*b[6];
    even6 = W6*b[2]-W2*b[6];
    odd1  = W1*b[1]+W7*b[7];
    odd7  = W7*b[1]-W1*b[7];
    odd5  = W5*b[5]+W3*b[3];
    odd3  = W3*b[5]-W5*b[3];

    /* step 2: combine the odd terms, scaled by 181/256 */
    rot_a = (181*(odd1-odd5+odd7-odd3)+128)>>8;
    rot_b = (181*(odd1-odd5-odd7+odd3)+128)>>8;

    /* step 3: final butterflies, outputs 0..3 mirror outputs 7..4 */
    b[0] = (even0+even2+odd1+odd5 + (1<<7))>>8;
    b[7] = (even0+even2-odd1-odd5 + (1<<7))>>8;
    b[1] = (even4+even6 +rot_a    + (1<<7))>>8;
    b[6] = (even4+even6 -rot_a    + (1<<7))>>8;
    b[2] = (even4-even6 +rot_b    + (1<<7))>>8;
    b[5] = (even4-even6 -rot_b    + (1<<7))>>8;
    b[3] = (even0-even2+odd7+odd3 + (1<<7))>>8;
    b[4] = (even0-even2-odd7-odd3 + (1<<7))>>8;
}
2689 static void wmv2_idct_col(short * b)
2690 {
2691     int s1,s2;
2692     int a0,a1,a2,a3,a4,a5,a6,a7;
2693     /*step 1, with extended precision*/
2694     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2695     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2696     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2697     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2698     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2699     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2700     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2701     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2702     /*step 2*/
2703     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2704     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2705     /*step 3*/
2706     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2707     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2708     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2709     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2710
2711     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2712     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2713     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2714     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2715 }
/**
 * In-place 8x8 WMV2 inverse DCT: the row transform is run on all eight
 * rows first, then the column transform on all eight columns.
 *
 * @param block 64 coefficients in row-major order, overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int k;

    for (k = 0; k < 8; k++)
        wmv2_idct_row(block + 8 * k);

    for (k = 0; k < 8; k++)
        wmv2_idct_col(block + k);
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/**
 * Run the WMV2 IDCT on block in place, then pass the result to
 * ff_put_pixels_clamped_c together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run the WMV2 IDCT on block in place, then pass the result to
 * ff_add_pixels_clamped_c together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct on block in place, then pass the result to
 * ff_put_pixels_clamped_c together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct on block in place, then pass the result to
 * ff_add_pixels_clamped_c together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
2748
/**
 * Run ff_j_rev_dct4 on block in place, then store the result through
 * put_pixels_clamped4_c. Selected by ff_dsputil_init when lowres == 1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct4 on block in place, then add the result to dest through
 * add_pixels_clamped4_c. Selected by ff_dsputil_init when lowres == 1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2759
/**
 * Run ff_j_rev_dct2 on block in place, then store the result through
 * put_pixels_clamped2_c. Selected by ff_dsputil_init when lowres == 2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct2 on block in place, then add the result to dest through
 * add_pixels_clamped2_c. Selected by ff_dsputil_init when lowres == 2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2770
2771 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2772 {
2773     dest[0] = av_clip_uint8((block[0] + 4)>>3);
2774 }
2775 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2776 {
2777     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2778 }
2779
2780 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2781
2782 /* init static data */
2783 av_cold void ff_dsputil_static_init(void)
2784 {
2785     int i;
2786
2787     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2788     for(i=0;i<MAX_NEG_CROP;i++) {
2789         ff_cropTbl[i] = 0;
2790         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2791     }
2792
2793     for(i=0;i<512;i++) {
2794         ff_squareTbl[i] = (i - 256) * (i - 256);
2795     }
2796
2797     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2798 }
2799
2800 int ff_check_alignment(void){
2801     static int did_fail=0;
2802     LOCAL_ALIGNED_16(int, aligned, [4]);
2803
2804     if((intptr_t)aligned & 15){
2805         if(!did_fail){
2806 #if HAVE_MMX || HAVE_ALTIVEC
2807             av_log(NULL, AV_LOG_ERROR,
2808                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2809                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2810                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2811                 "Do not report crashes to FFmpeg developers.\n");
2812 #endif
2813             did_fail=1;
2814         }
2815         return -1;
2816     }
2817     return 0;
2818 }
2819
/**
 * Fill a DSPContext with function pointers.
 *
 * The order of the steps matters:
 *  1. forward DCT (encoder builds only) and inverse DCT are chosen from
 *     avctx->bits_per_raw_sample, avctx->lowres, avctx->dct_algo and
 *     avctx->idct_algo,
 *  2. the C implementations of all other operations are installed,
 *  3. the bit-depth dependent functions are installed according to
 *     avctx->bits_per_raw_sample and c->dct_bits,
 *  4. the per-architecture init functions are called with the same context,
 *  5. still-empty 2-tap qpel entries are filled from the H.264 qpel tables
 *     and the scantable permutation is set up for the chosen IDCT.
 *
 * c->dct_bits is read but never assigned in this function, so the caller is
 * expected to have set it beforehand.
 *
 * @param c     context to fill
 * @param avctx codec context supplying the configuration
 */
av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i, j;

    /* the return value is ignored; a failure is only logged */
    ff_check_alignment();

#if CONFIG_ENCODERS
    /* forward DCT: 10-bit input gets the 10-bit islow variant, otherwise
     * dct_algo selects the implementation */
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = ff_fdct_ifast;
            c->fdct248 = ff_fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT: lowres 1/2/3 take precedence and use the reduced-size
     * jref IDCTs; otherwise 10-bit input or idct_algo decides */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = ff_j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = ff_j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = ff_j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if (avctx->bits_per_raw_sample == 10) {
            c->idct_put              = ff_simple_idct_put_10;
            c->idct_add              = ff_simple_idct_add_10;
            c->idct                  = ff_simple_idct_10;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        } else {
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = ff_j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put = ff_simple_idct_put_8;
            c->idct_add = ff_simple_idct_add_8;
            c->idct     = ff_simple_idct_8;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
        }
    }

    /* C implementations that do not depend on the bit depth */
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* third-pel tables: only indices 0-2, 4-6 and 8-10 are assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc(PFX, IDX, NUM) fills the 16 entries of c->PFX_pixels_tab[IDX]
     * with PFX<NUM>_mcXY_c, where the table index is X + 4*Y */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc

    /* codec-specific additions, only built when the decoders are enabled */
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) installs name16_c in slot 0 and name8x8_c in slot 1 */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    /* slots 4 and 5 hold the intra (single-source) variants */
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* audio / float vector helpers */
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->butterflies_float_interleave = butterflies_float_interleave_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    /* default prefetch is a no-op */
    c->prefetch= just_return;

    /* cleared here so that the fallback loop at the end of this function can
     * tell which 2-tap qpel entries were left unset */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* FUNC/FUNCC build the names of the per-bit-depth template functions:
     * f_<depth> and f_<depth>_c respectively */
#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


    /* BIT_DEPTH_FUNCS(depth, dct) installs every function that exists once
     * per pixel bit depth; `dct` is the _16/_32 suffix used by the entries
     * that operate on DCT coefficients.
     * NOTE(review): avg_h264_qpel is set for indices 0..2 only, index 3 is
     * not assigned by this macro. */
#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

    /* Depths 9, 10, 12 and 14 have dedicated function sets. Any other value
     * gets the 8-bit set, but only if it is <= 8 or the codec is not video;
     * otherwise the pointers covered by BIT_DEPTH_FUNCS are left untouched. */
    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    case 12:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(12, _32);
        } else {
            BIT_DEPTH_FUNCS(12, _16);
        }
        break;
    case 14:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(14, _32);
        } else {
            BIT_DEPTH_FUNCS(14, _16);
        }
        break;
    default:
        if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
            BIT_DEPTH_FUNCS(8, _16);
        }
        break;
    }


    /* Architecture-specific initialization runs after the C defaults are in
     * place (presumably replacing some of them with optimized versions). */
    if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
    if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);

    /* any 2-tap qpel entry that is still NULL falls back to the H.264 qpel
     * function at the same table position */
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 16; j++) {
            if(!c->put_2tap_qpel_pixels_tab[i][j])
                c->put_2tap_qpel_pixels_tab[i][j] =
                    c->put_h264_qpel_pixels_tab[i][j];
            if(!c->avg_2tap_qpel_pixels_tab[i][j])
                c->avg_2tap_qpel_pixels_tab[i][j] =
                    c->avg_h264_qpel_pixels_tab[i][j];
        }
    }

    /* uses the permutation type as it stands after all of the steps above */
    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}
3194
/**
 * Unprefixed entry point that forwards directly to ff_dsputil_init().
 *
 * @param c     context to fill
 * @param avctx codec context supplying the configuration
 */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    ff_dsputil_init(c, avctx);
}