git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "vorbis.h"
  40 #include "diracdsp.h"
  41
  42 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  43 uint32_t ff_squareTbl[512] = {0, };
  44
/*
 * Instantiate dsputil_template.c once per bit depth.  The template is
 * parameterized through two macros set before each include:
 *   BIT_DEPTH - the sample bit depth of the instance,
 *   pixeltmp  - int16_t for the 9, 10 and 8 bit instances, int32_t for the
 *               12 and 14 bit instances.
 */
#define pixeltmp int16_t
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

/* switch the temporary type to 32 bits for the 12 and 14 bit instances */
#undef pixeltmp
#define pixeltmp int32_t
#define BIT_DEPTH 12
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 14
#include "dsputil_template.c"
#undef BIT_DEPTH

/* The 8 bit instance comes last.  Only pixeltmp is undefined afterwards, so
 * BIT_DEPTH stays defined as 8 for the rest of this file.
 * NOTE(review): presumably intentional - confirm before adding an #undef. */
#undef pixeltmp
#define pixeltmp int16_t
#define BIT_DEPTH 8
#include "dsputil_template.c"
#undef pixeltmp
  69
  70 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  71 #define pb_7f (~0UL/255 * 0x7f)
  72 #define pb_80 (~0UL/255 * 0x80)
  73
  74 const uint8_t ff_zigzag_direct[64] = {
  75     0,   1,  8, 16,  9,  2,  3, 10,
  76     17, 24, 32, 25, 18, 11,  4,  5,
  77     12, 19, 26, 33, 40, 48, 41, 34,
  78     27, 20, 13,  6,  7, 14, 21, 28,
  79     35, 42, 49, 56, 57, 50, 43, 36,
  80     29, 22, 15, 23, 30, 37, 44, 51,
  81     58, 59, 52, 45, 38, 31, 39, 46,
  82     53, 60, 61, 54, 47, 55, 62, 63
  83 };
  84
  85 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  86    specification, we interleave the fields */
  87 const uint8_t ff_zigzag248_direct[64] = {
  88      0,  8,  1,  9, 16, 24,  2, 10,
  89     17, 25, 32, 40, 48, 56, 33, 41,
  90     18, 26,  3, 11,  4, 12, 19, 27,
  91     34, 42, 49, 57, 50, 58, 35, 43,
  92     20, 28,  5, 13,  6, 14, 21, 29,
  93     36, 44, 51, 59, 52, 60, 37, 45,
  94     22, 30,  7, 15, 23, 31, 38, 46,
  95     53, 61, 54, 62, 39, 47, 55, 63,
  96 };
  97
/* Inverse of ff_zigzag_direct, not permuted, with 1 added to each entry;
 * used by the MMX quantizer.  Only declared here, with a requested alignment
 * of 16.  NOTE(review): presumably filled in at init time outside this
 * excerpt. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
 100
 101 const uint8_t ff_alternate_horizontal_scan[64] = {
 102     0,  1,   2,  3,  8,  9, 16, 17,
 103     10, 11,  4,  5,  6,  7, 15, 14,
 104     13, 12, 19, 18, 24, 25, 32, 33,
 105     26, 27, 20, 21, 22, 23, 28, 29,
 106     30, 31, 34, 35, 40, 41, 48, 49,
 107     42, 43, 36, 37, 38, 39, 44, 45,
 108     46, 47, 50, 51, 56, 57, 58, 59,
 109     52, 53, 54, 55, 60, 61, 62, 63,
 110 };
 111
 112 const uint8_t ff_alternate_vertical_scan[64] = {
 113     0,  8,  16, 24,  1,  9,  2, 10,
 114     17, 25, 32, 40, 48, 56, 57, 49,
 115     41, 33, 26, 18,  3, 11,  4, 12,
 116     19, 27, 34, 42, 50, 58, 35, 43,
 117     51, 59, 20, 28,  5, 13,  6, 14,
 118     21, 29, 36, 44, 52, 60, 37, 45,
 119     53, 61, 22, 30,  7, 15, 23, 31,
 120     38, 46, 54, 62, 39, 47, 55, 63,
 121 };
 122
 123 /* Input permutation for the simple_idct_mmx */
 124 static const uint8_t simple_mmx_permutation[64]={
 125         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 126         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 127         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 128         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 129         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 130         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 131         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 132         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 133 };
 134
 135 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 136
 137 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 138     int i;
 139     int end;
 140
 141     st->scantable= src_scantable;
 142
 143     for(i=0; i<64; i++){
 144         int j;
 145         j = src_scantable[i];
 146         st->permutated[i] = permutation[j];
 147     }
 148
 149     end=-1;
 150     for(i=0; i<64; i++){
 151         int j;
 152         j = st->permutated[i];
 153         if(j>end) end=j;
 154         st->raster_end[i]= end;
 155     }
 156 }
 157
 158 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 159                                    int idct_permutation_type)
 160 {
 161     int i;
 162
 163     switch(idct_permutation_type){
 164     case FF_NO_IDCT_PERM:
 165         for(i=0; i<64; i++)
 166             idct_permutation[i]= i;
 167         break;
 168     case FF_LIBMPEG2_IDCT_PERM:
 169         for(i=0; i<64; i++)
 170             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 171         break;
 172     case FF_SIMPLE_IDCT_PERM:
 173         for(i=0; i<64; i++)
 174             idct_permutation[i]= simple_mmx_permutation[i];
 175         break;
 176     case FF_TRANSPOSE_IDCT_PERM:
 177         for(i=0; i<64; i++)
 178             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 179         break;
 180     case FF_PARTTRANS_IDCT_PERM:
 181         for(i=0; i<64; i++)
 182             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 183         break;
 184     case FF_SSE2_IDCT_PERM:
 185         for(i=0; i<64; i++)
 186             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 187         break;
 188     default:
 189         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 190     }
 191 }
 192
 193 static int pix_sum_c(uint8_t * pix, int line_size)
 194 {
 195     int s, i, j;
 196
 197     s = 0;
 198     for (i = 0; i < 16; i++) {
 199         for (j = 0; j < 16; j += 8) {
 200             s += pix[0];
 201             s += pix[1];
 202             s += pix[2];
 203             s += pix[3];
 204             s += pix[4];
 205             s += pix[5];
 206             s += pix[6];
 207             s += pix[7];
 208             pix += 8;
 209         }
 210         pix += line_size - 16;
 211     }
 212     return s;
 213 }
 214
 215 static int pix_norm1_c(uint8_t * pix, int line_size)
 216 {
 217     int s, i, j;
 218     uint32_t *sq = ff_squareTbl + 256;
 219
 220     s = 0;
 221     for (i = 0; i < 16; i++) {
 222         for (j = 0; j < 16; j += 8) {
 223 #if 0
 224             s += sq[pix[0]];
 225             s += sq[pix[1]];
 226             s += sq[pix[2]];
 227             s += sq[pix[3]];
 228             s += sq[pix[4]];
 229             s += sq[pix[5]];
 230             s += sq[pix[6]];
 231             s += sq[pix[7]];
 232 #else
 233 #if HAVE_FAST_64BIT
 234             register uint64_t x=*(uint64_t*)pix;
 235             s += sq[x&0xff];
 236             s += sq[(x>>8)&0xff];
 237             s += sq[(x>>16)&0xff];
 238             s += sq[(x>>24)&0xff];
 239             s += sq[(x>>32)&0xff];
 240             s += sq[(x>>40)&0xff];
 241             s += sq[(x>>48)&0xff];
 242             s += sq[(x>>56)&0xff];
 243 #else
 244             register uint32_t x=*(uint32_t*)pix;
 245             s += sq[x&0xff];
 246             s += sq[(x>>8)&0xff];
 247             s += sq[(x>>16)&0xff];
 248             s += sq[(x>>24)&0xff];
 249             x=*(uint32_t*)(pix+4);
 250             s += sq[x&0xff];
 251             s += sq[(x>>8)&0xff];
 252             s += sq[(x>>16)&0xff];
 253             s += sq[(x>>24)&0xff];
 254 #endif
 255 #endif
 256             pix += 8;
 257         }
 258         pix += line_size - 16;
 259     }
 260     return s;
 261 }
 262
 263 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 264     int i;
 265
 266     for(i=0; i+8<=w; i+=8){
 267         dst[i+0]= av_bswap32(src[i+0]);
 268         dst[i+1]= av_bswap32(src[i+1]);
 269         dst[i+2]= av_bswap32(src[i+2]);
 270         dst[i+3]= av_bswap32(src[i+3]);
 271         dst[i+4]= av_bswap32(src[i+4]);
 272         dst[i+5]= av_bswap32(src[i+5]);
 273         dst[i+6]= av_bswap32(src[i+6]);
 274         dst[i+7]= av_bswap32(src[i+7]);
 275     }
 276     for(;i<w; i++){
 277         dst[i+0]= av_bswap32(src[i+0]);
 278     }
 279 }
 280
 281 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 282 {
 283     while (len--)
 284         *dst++ = av_bswap16(*src++);
 285 }
 286
 287 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 288 {
 289     int s, i;
 290     uint32_t *sq = ff_squareTbl + 256;
 291
 292     s = 0;
 293     for (i = 0; i < h; i++) {
 294         s += sq[pix1[0] - pix2[0]];
 295         s += sq[pix1[1] - pix2[1]];
 296         s += sq[pix1[2] - pix2[2]];
 297         s += sq[pix1[3] - pix2[3]];
 298         pix1 += line_size;
 299         pix2 += line_size;
 300     }
 301     return s;
 302 }
 303
 304 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 305 {
 306     int s, i;
 307     uint32_t *sq = ff_squareTbl + 256;
 308
 309     s = 0;
 310     for (i = 0; i < h; i++) {
 311         s += sq[pix1[0] - pix2[0]];
 312         s += sq[pix1[1] - pix2[1]];
 313         s += sq[pix1[2] - pix2[2]];
 314         s += sq[pix1[3] - pix2[3]];
 315         s += sq[pix1[4] - pix2[4]];
 316         s += sq[pix1[5] - pix2[5]];
 317         s += sq[pix1[6] - pix2[6]];
 318         s += sq[pix1[7] - pix2[7]];
 319         pix1 += line_size;
 320         pix2 += line_size;
 321     }
 322     return s;
 323 }
 324
 325 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 326 {
 327     int s, i;
 328     uint32_t *sq = ff_squareTbl + 256;
 329
 330     s = 0;
 331     for (i = 0; i < h; i++) {
 332         s += sq[pix1[ 0] - pix2[ 0]];
 333         s += sq[pix1[ 1] - pix2[ 1]];
 334         s += sq[pix1[ 2] - pix2[ 2]];
 335         s += sq[pix1[ 3] - pix2[ 3]];
 336         s += sq[pix1[ 4] - pix2[ 4]];
 337         s += sq[pix1[ 5] - pix2[ 5]];
 338         s += sq[pix1[ 6] - pix2[ 6]];
 339         s += sq[pix1[ 7] - pix2[ 7]];
 340         s += sq[pix1[ 8] - pix2[ 8]];
 341         s += sq[pix1[ 9] - pix2[ 9]];
 342         s += sq[pix1[10] - pix2[10]];
 343         s += sq[pix1[11] - pix2[11]];
 344         s += sq[pix1[12] - pix2[12]];
 345         s += sq[pix1[13] - pix2[13]];
 346         s += sq[pix1[14] - pix2[14]];
 347         s += sq[pix1[15] - pix2[15]];
 348
 349         pix1 += line_size;
 350         pix2 += line_size;
 351     }
 352     return s;
 353 }
 354
 355 static void diff_pixels_c(DCTELEM *av_restrict block, const uint8_t *s1,
 356                           const uint8_t *s2, int stride){
 357     int i;
 358
 359     /* read the pixels */
 360     for(i=0;i<8;i++) {
 361         block[0] = s1[0] - s2[0];
 362         block[1] = s1[1] - s2[1];
 363         block[2] = s1[2] - s2[2];
 364         block[3] = s1[3] - s2[3];
 365         block[4] = s1[4] - s2[4];
 366         block[5] = s1[5] - s2[5];
 367         block[6] = s1[6] - s2[6];
 368         block[7] = s1[7] - s2[7];
 369         s1 += stride;
 370         s2 += stride;
 371         block += 8;
 372     }
 373 }
 374
 375
 376 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *av_restrict pixels,
 377                                  int line_size)
 378 {
 379     int i;
 380
 381     /* read the pixels */
 382     for(i=0;i<8;i++) {
 383         pixels[0] = av_clip_uint8(block[0]);
 384         pixels[1] = av_clip_uint8(block[1]);
 385         pixels[2] = av_clip_uint8(block[2]);
 386         pixels[3] = av_clip_uint8(block[3]);
 387         pixels[4] = av_clip_uint8(block[4]);
 388         pixels[5] = av_clip_uint8(block[5]);
 389         pixels[6] = av_clip_uint8(block[6]);
 390         pixels[7] = av_clip_uint8(block[7]);
 391
 392         pixels += line_size;
 393         block += 8;
 394     }
 395 }
 396
 397 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *av_restrict pixels,
 398                                  int line_size)
 399 {
 400     int i;
 401
 402     /* read the pixels */
 403     for(i=0;i<4;i++) {
 404         pixels[0] = av_clip_uint8(block[0]);
 405         pixels[1] = av_clip_uint8(block[1]);
 406         pixels[2] = av_clip_uint8(block[2]);
 407         pixels[3] = av_clip_uint8(block[3]);
 408
 409         pixels += line_size;
 410         block += 8;
 411     }
 412 }
 413
 414 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *av_restrict pixels,
 415                                  int line_size)
 416 {
 417     int i;
 418
 419     /* read the pixels */
 420     for(i=0;i<2;i++) {
 421         pixels[0] = av_clip_uint8(block[0]);
 422         pixels[1] = av_clip_uint8(block[1]);
 423
 424         pixels += line_size;
 425         block += 8;
 426     }
 427 }
 428
 429 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 430                                         uint8_t *av_restrict pixels,
 431                                         int line_size)
 432 {
 433     int i, j;
 434
 435     for (i = 0; i < 8; i++) {
 436         for (j = 0; j < 8; j++) {
 437             if (*block < -128)
 438                 *pixels = 0;
 439             else if (*block > 127)
 440                 *pixels = 255;
 441             else
 442                 *pixels = (uint8_t)(*block + 128);
 443             block++;
 444             pixels++;
 445         }
 446         pixels += (line_size - 8);
 447     }
 448 }
 449
 450 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *av_restrict pixels,
 451                                  int line_size)
 452 {
 453     int i;
 454
 455     /* read the pixels */
 456     for(i=0;i<8;i++) {
 457         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 458         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 459         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 460         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 461         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 462         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 463         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 464         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 465         pixels += line_size;
 466         block += 8;
 467     }
 468 }
 469
 470 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *av_restrict pixels,
 471                           int line_size)
 472 {
 473     int i;
 474
 475     /* read the pixels */
 476     for(i=0;i<4;i++) {
 477         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 478         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 479         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 480         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *av_restrict pixels,
 487                           int line_size)
 488 {
 489     int i;
 490
 491     /* read the pixels */
 492     for(i=0;i<2;i++) {
 493         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 494         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 495         pixels += line_size;
 496         block += 8;
 497     }
 498 }
 499
 500 static int sum_abs_dctelem_c(DCTELEM *block)
 501 {
 502     int sum=0, i;
 503     for(i=0; i<64; i++)
 504         sum+= FFABS(block[i]);
 505     return sum;
 506 }
 507
 508 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 509 {
 510     int i;
 511
 512     for (i = 0; i < h; i++) {
 513         memset(block, value, 16);
 514         block += line_size;
 515     }
 516 }
 517
 518 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 519 {
 520     int i;
 521
 522     for (i = 0; i < h; i++) {
 523         memset(block, value, 8);
 524         block += line_size;
 525     }
 526 }
 527
 528 #define avg2(a,b) ((a+b+1)>>1)
 529 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 530
 531 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 532 {
 533     const int A=(16-x16)*(16-y16);
 534     const int B=(   x16)*(16-y16);
 535     const int C=(16-x16)*(   y16);
 536     const int D=(   x16)*(   y16);
 537     int i;
 538
 539     for(i=0; i<h; i++)
 540     {
 541         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 542         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 543         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 544         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 545         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 546         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 547         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 548         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 549         dst+= stride;
 550         src+= stride;
 551     }
 552 }
 553
 554 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 555                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 556 {
 557     int y, vx, vy;
 558     const int s= 1<<shift;
 559
 560     width--;
 561     height--;
 562
 563     for(y=0; y<h; y++){
 564         int x;
 565
 566         vx= ox;
 567         vy= oy;
 568         for(x=0; x<8; x++){ //XXX FIXME optimize
 569             int src_x, src_y, frac_x, frac_y, index;
 570
 571             src_x= vx>>16;
 572             src_y= vy>>16;
 573             frac_x= src_x&(s-1);
 574             frac_y= src_y&(s-1);
 575             src_x>>=shift;
 576             src_y>>=shift;
 577
 578             if((unsigned)src_x < width){
 579                 if((unsigned)src_y < height){
 580                     index= src_x + src_y*stride;
 581                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 582                                            + src[index       +1]*   frac_x )*(s-frac_y)
 583                                         + (  src[index+stride  ]*(s-frac_x)
 584                                            + src[index+stride+1]*   frac_x )*   frac_y
 585                                         + r)>>(shift*2);
 586                 }else{
 587                     index= src_x + av_clip(src_y, 0, height)*stride;
 588                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 589                                           + src[index       +1]*   frac_x )*s
 590                                         + r)>>(shift*2);
 591                 }
 592             }else{
 593                 if((unsigned)src_y < height){
 594                     index= av_clip(src_x, 0, width) + src_y*stride;
 595                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 596                                            + src[index+stride  ]*   frac_y )*s
 597                                         + r)>>(shift*2);
 598                 }else{
 599                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 600                     dst[y*stride + x]=    src[index         ];
 601                 }
 602             }
 603
 604             vx+= dxx;
 605             vy+= dyx;
 606         }
 607         ox += dxy;
 608         oy += dyy;
 609     }
 610 }
 611
 612 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 613     switch(width){
 614     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 615     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 616     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 617     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 618     }
 619 }
 620
 621 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 622     int i,j;
 623     for (i=0; i < height; i++) {
 624       for (j=0; j < width; j++) {
 625         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 626       }
 627       src += stride;
 628       dst += stride;
 629     }
 630 }
 631
 632 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 633     int i,j;
 634     for (i=0; i < height; i++) {
 635       for (j=0; j < width; j++) {
 636         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 637       }
 638       src += stride;
 639       dst += stride;
 640     }
 641 }
 642
 643 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 644     int i,j;
 645     for (i=0; i < height; i++) {
 646       for (j=0; j < width; j++) {
 647         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 648       }
 649       src += stride;
 650       dst += stride;
 651     }
 652 }
 653
 654 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 655     int i,j;
 656     for (i=0; i < height; i++) {
 657       for (j=0; j < width; j++) {
 658         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 659       }
 660       src += stride;
 661       dst += stride;
 662     }
 663 }
 664
 665 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 666     int i,j;
 667     for (i=0; i < height; i++) {
 668       for (j=0; j < width; j++) {
 669         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 670       }
 671       src += stride;
 672       dst += stride;
 673     }
 674 }
 675
 676 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 677     int i,j;
 678     for (i=0; i < height; i++) {
 679       for (j=0; j < width; j++) {
 680         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 681       }
 682       src += stride;
 683       dst += stride;
 684     }
 685 }
 686
 687 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 688     int i,j;
 689     for (i=0; i < height; i++) {
 690       for (j=0; j < width; j++) {
 691         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 692       }
 693       src += stride;
 694       dst += stride;
 695     }
 696 }
 697
 698 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 699     int i,j;
 700     for (i=0; i < height; i++) {
 701       for (j=0; j < width; j++) {
 702         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 703       }
 704       src += stride;
 705       dst += stride;
 706     }
 707 }
 708
 709 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 710     switch(width){
 711     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 712     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 713     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 714     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 715     }
 716 }
 717
 718 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 719     int i,j;
 720     for (i=0; i < height; i++) {
 721       for (j=0; j < width; j++) {
 722         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 723       }
 724       src += stride;
 725       dst += stride;
 726     }
 727 }
 728
 729 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 730     int i,j;
 731     for (i=0; i < height; i++) {
 732       for (j=0; j < width; j++) {
 733         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 734       }
 735       src += stride;
 736       dst += stride;
 737     }
 738 }
 739
 740 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 741     int i,j;
 742     for (i=0; i < height; i++) {
 743       for (j=0; j < width; j++) {
 744         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 745       }
 746       src += stride;
 747       dst += stride;
 748     }
 749 }
 750
 751 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 752     int i,j;
 753     for (i=0; i < height; i++) {
 754       for (j=0; j < width; j++) {
 755         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 756       }
 757       src += stride;
 758       dst += stride;
 759     }
 760 }
 761
 762 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 763     int i,j;
 764     for (i=0; i < height; i++) {
 765       for (j=0; j < width; j++) {
 766         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 767       }
 768       src += stride;
 769       dst += stride;
 770     }
 771 }
 772
 773 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 774     int i,j;
 775     for (i=0; i < height; i++) {
 776       for (j=0; j < width; j++) {
 777         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 778       }
 779       src += stride;
 780       dst += stride;
 781     }
 782 }
 783
 784 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 785     int i,j;
 786     for (i=0; i < height; i++) {
 787       for (j=0; j < width; j++) {
 788         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 789       }
 790       src += stride;
 791       dst += stride;
 792     }
 793 }
 794
 795 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 796     int i,j;
 797     for (i=0; i < height; i++) {
 798       for (j=0; j < width; j++) {
 799         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 800       }
 801       src += stride;
 802       dst += stride;
 803     }
 804 }
 805
 806 #define QPEL_MC(r, OPNAME, RND, OP) \
 807 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 808     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 809     int i;\
 810     for(i=0; i<h; i++)\
 811     {\
 812         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 813         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 814         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 815         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 816         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 817         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 818         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 819         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 820         dst+=dstStride;\
 821         src+=srcStride;\
 822     }\
 823 }\
 824 \
 825 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 826     const int w=8;\
 827     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 828     int i;\
 829     for(i=0; i<w; i++)\
 830     {\
 831         const int src0= src[0*srcStride];\
 832         const int src1= src[1*srcStride];\
 833         const int src2= src[2*srcStride];\
 834         const int src3= src[3*srcStride];\
 835         const int src4= src[4*srcStride];\
 836         const int src5= src[5*srcStride];\
 837         const int src6= src[6*srcStride];\
 838         const int src7= src[7*srcStride];\
 839         const int src8= src[8*srcStride];\
 840         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 841         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 842         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 843         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 844         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 845         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 846         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 847         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 848         dst++;\
 849         src++;\
 850     }\
 851 }\
 852 \
 853 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 854     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 855     int i;\
 856     \
 857     for(i=0; i<h; i++)\
 858     {\
 859         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 860         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 861         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 862         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 863         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 864         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 865         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 866         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 867         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 868         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 869         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 870         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 871         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 872         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 873         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 874         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 875         dst+=dstStride;\
 876         src+=srcStride;\
 877     }\
 878 }\
 879 \
 880 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 881     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 882     int i;\
 883     const int w=16;\
 884     for(i=0; i<w; i++)\
 885     {\
 886         const int src0= src[0*srcStride];\
 887         const int src1= src[1*srcStride];\
 888         const int src2= src[2*srcStride];\
 889         const int src3= src[3*srcStride];\
 890         const int src4= src[4*srcStride];\
 891         const int src5= src[5*srcStride];\
 892         const int src6= src[6*srcStride];\
 893         const int src7= src[7*srcStride];\
 894         const int src8= src[8*srcStride];\
 895         const int src9= src[9*srcStride];\
 896         const int src10= src[10*srcStride];\
 897         const int src11= src[11*srcStride];\
 898         const int src12= src[12*srcStride];\
 899         const int src13= src[13*srcStride];\
 900         const int src14= src[14*srcStride];\
 901         const int src15= src[15*srcStride];\
 902         const int src16= src[16*srcStride];\
 903         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 904         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 905         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 906         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 907         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 908         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 909         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 910         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 911         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 912         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 913         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 914         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 915         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 916         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 917         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 918         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 919         dst++;\
 920         src++;\
 921     }\
 922 }\
 923 \
 924 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 925     uint8_t half[64];\
 926     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 927     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 928 }\
 929 \
 930 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 931     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 932 }\
 933 \
 934 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 935     uint8_t half[64];\
 936     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 937     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 938 }\
 939 \
 940 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 941     uint8_t full[16*9];\
 942     uint8_t half[64];\
 943     copy_block9(full, src, 16, stride, 9);\
 944     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 945     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 946 }\
 947 \
 948 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 949     uint8_t full[16*9];\
 950     copy_block9(full, src, 16, stride, 9);\
 951     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 952 }\
 953 \
 954 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 955     uint8_t full[16*9];\
 956     uint8_t half[64];\
 957     copy_block9(full, src, 16, stride, 9);\
 958     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 959     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 960 }\
 961 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 962     uint8_t full[16*9];\
 963     uint8_t halfH[72];\
 964     uint8_t halfV[64];\
 965     uint8_t halfHV[64];\
 966     copy_block9(full, src, 16, stride, 9);\
 967     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 968     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 969     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 970     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 971 }\
 972 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 973     uint8_t full[16*9];\
 974     uint8_t halfH[72];\
 975     uint8_t halfHV[64];\
 976     copy_block9(full, src, 16, stride, 9);\
 977     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 978     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 979     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 980     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 981 }\
 982 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 983     uint8_t full[16*9];\
 984     uint8_t halfH[72];\
 985     uint8_t halfV[64];\
 986     uint8_t halfHV[64];\
 987     copy_block9(full, src, 16, stride, 9);\
 988     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 989     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 990     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 991     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 992 }\
 993 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 994     uint8_t full[16*9];\
 995     uint8_t halfH[72];\
 996     uint8_t halfHV[64];\
 997     copy_block9(full, src, 16, stride, 9);\
 998     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 999     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1000     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1001     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1002 }\
1003 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1004     uint8_t full[16*9];\
1005     uint8_t halfH[72];\
1006     uint8_t halfV[64];\
1007     uint8_t halfHV[64];\
1008     copy_block9(full, src, 16, stride, 9);\
1009     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1010     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1011     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1012     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1013 }\
1014 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1015     uint8_t full[16*9];\
1016     uint8_t halfH[72];\
1017     uint8_t halfHV[64];\
1018     copy_block9(full, src, 16, stride, 9);\
1019     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1020     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1021     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1022     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1023 }\
1024 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1025     uint8_t full[16*9];\
1026     uint8_t halfH[72];\
1027     uint8_t halfV[64];\
1028     uint8_t halfHV[64];\
1029     copy_block9(full, src, 16, stride, 9);\
1030     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1031     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1032     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1033     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1034 }\
1035 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1036     uint8_t full[16*9];\
1037     uint8_t halfH[72];\
1038     uint8_t halfHV[64];\
1039     copy_block9(full, src, 16, stride, 9);\
1040     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1041     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1042     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1043     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1044 }\
1045 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1046     uint8_t halfH[72];\
1047     uint8_t halfHV[64];\
1048     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1049     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1051 }\
1052 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1053     uint8_t halfH[72];\
1054     uint8_t halfHV[64];\
1055     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1056     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1057     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1058 }\
1059 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1060     uint8_t full[16*9];\
1061     uint8_t halfH[72];\
1062     uint8_t halfV[64];\
1063     uint8_t halfHV[64];\
1064     copy_block9(full, src, 16, stride, 9);\
1065     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1066     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1067     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1068     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1069 }\
1070 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1071     uint8_t full[16*9];\
1072     uint8_t halfH[72];\
1073     copy_block9(full, src, 16, stride, 9);\
1074     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1076     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1077 }\
1078 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1079     uint8_t full[16*9];\
1080     uint8_t halfH[72];\
1081     uint8_t halfV[64];\
1082     uint8_t halfHV[64];\
1083     copy_block9(full, src, 16, stride, 9);\
1084     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1085     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1086     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1087     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1088 }\
1089 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1090     uint8_t full[16*9];\
1091     uint8_t halfH[72];\
1092     copy_block9(full, src, 16, stride, 9);\
1093     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1094     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1095     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1096 }\
1097 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1098     uint8_t halfH[72];\
1099     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1100     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1101 }\
1102 \
1103 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1104     uint8_t half[256];\
1105     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1106     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1107 }\
1108 \
1109 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1110     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1111 }\
1112 \
1113 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1114     uint8_t half[256];\
1115     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1116     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1117 }\
1118 \
1119 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1120     uint8_t full[24*17];\
1121     uint8_t half[256];\
1122     copy_block17(full, src, 24, stride, 17);\
1123     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1124     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1125 }\
1126 \
1127 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1128     uint8_t full[24*17];\
1129     copy_block17(full, src, 24, stride, 17);\
1130     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1131 }\
1132 \
1133 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1134     uint8_t full[24*17];\
1135     uint8_t half[256];\
1136     copy_block17(full, src, 24, stride, 17);\
1137     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1138     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1139 }\
1140 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1141     uint8_t full[24*17];\
1142     uint8_t halfH[272];\
1143     uint8_t halfV[256];\
1144     uint8_t halfHV[256];\
1145     copy_block17(full, src, 24, stride, 17);\
1146     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1147     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1148     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1149     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1150 }\
1151 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1152     uint8_t full[24*17];\
1153     uint8_t halfH[272];\
1154     uint8_t halfHV[256];\
1155     copy_block17(full, src, 24, stride, 17);\
1156     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1157     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1158     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1159     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1160 }\
1161 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1162     uint8_t full[24*17];\
1163     uint8_t halfH[272];\
1164     uint8_t halfV[256];\
1165     uint8_t halfHV[256];\
1166     copy_block17(full, src, 24, stride, 17);\
1167     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1168     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1169     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1170     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1171 }\
1172 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1173     uint8_t full[24*17];\
1174     uint8_t halfH[272];\
1175     uint8_t halfHV[256];\
1176     copy_block17(full, src, 24, stride, 17);\
1177     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1179     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1181 }\
1182 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1183     uint8_t full[24*17];\
1184     uint8_t halfH[272];\
1185     uint8_t halfV[256];\
1186     uint8_t halfHV[256];\
1187     copy_block17(full, src, 24, stride, 17);\
1188     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1189     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1190     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1191     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1192 }\
1193 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1194     uint8_t full[24*17];\
1195     uint8_t halfH[272];\
1196     uint8_t halfHV[256];\
1197     copy_block17(full, src, 24, stride, 17);\
1198     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1199     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1200     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1201     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1202 }\
1203 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1204     uint8_t full[24*17];\
1205     uint8_t halfH[272];\
1206     uint8_t halfV[256];\
1207     uint8_t halfHV[256];\
1208     copy_block17(full, src, 24, stride, 17);\
1209     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1210     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1211     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1212     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1213 }\
1214 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1215     uint8_t full[24*17];\
1216     uint8_t halfH[272];\
1217     uint8_t halfHV[256];\
1218     copy_block17(full, src, 24, stride, 17);\
1219     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1220     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1221     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1222     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1223 }\
1224 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1225     uint8_t halfH[272];\
1226     uint8_t halfHV[256];\
1227     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1228     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1230 }\
1231 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1232     uint8_t halfH[272];\
1233     uint8_t halfHV[256];\
1234     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1235     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1236     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1237 }\
1238 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1239     uint8_t full[24*17];\
1240     uint8_t halfH[272];\
1241     uint8_t halfV[256];\
1242     uint8_t halfHV[256];\
1243     copy_block17(full, src, 24, stride, 17);\
1244     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1245     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1246     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1247     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1248 }\
1249 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1250     uint8_t full[24*17];\
1251     uint8_t halfH[272];\
1252     copy_block17(full, src, 24, stride, 17);\
1253     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1254     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1255     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1256 }\
1257 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1258     uint8_t full[24*17];\
1259     uint8_t halfH[272];\
1260     uint8_t halfV[256];\
1261     uint8_t halfHV[256];\
1262     copy_block17(full, src, 24, stride, 17);\
1263     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1264     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1265     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1266     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1267 }\
1268 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1269     uint8_t full[24*17];\
1270     uint8_t halfH[272];\
1271     copy_block17(full, src, 24, stride, 17);\
1272     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1273     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1274     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1275 }\
1276 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1277     uint8_t halfH[272];\
1278     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1279     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1280 }
1281
1282 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1283 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1284 #define op_put(a, b) a = cm[((b) + 16)>>5]
1285 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1286
1287 QPEL_MC(0, put_       , _       , op_put)
1288 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1289 QPEL_MC(0, avg_       , _       , op_avg)
1290 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1291 #undef op_avg
1292 #undef op_avg_no_rnd
1293 #undef op_put
1294 #undef op_put_no_rnd
1295
/*
 * The mc00 names are not generated by QPEL_MC; they are plain aliases for
 * existing whole-block pixel functions.
 * NOTE(review): the last alias targets the "_8_c" spelling while the others
 * use the "_c" one -- kept as found; confirm both symbols exist.
 */
#define put_qpel8_mc00_c          ff_put_pixels8x8_c
#define avg_qpel8_mc00_c          ff_avg_pixels8x8_c
#define put_qpel16_mc00_c         ff_put_pixels16x16_c
#define avg_qpel16_mc00_c         ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c   ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c  ff_put_pixels16x16_8_c
1302
1303 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1304     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1305     int i;
1306
1307     for(i=0; i<h; i++){
1308         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1309         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1310         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1311         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1312         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1313         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1314         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1315         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1316         dst+=dstStride;
1317         src+=srcStride;
1318     }
1319 }
1320
1321 #if CONFIG_RV40_DECODER
/** RV40 16x16 mc33, put variant: forwards to put_pixels16_xy2_8_c over 16 rows. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_xy2_8_c(dst, src, stride, rows);
}
/** RV40 16x16 mc33, avg variant: forwards to avg_pixels16_xy2_8_c over 16 rows. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}
/** RV40 8x8 mc33, put variant: forwards to put_pixels8_xy2_8_c over 8 rows. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_xy2_8_c(dst, src, stride, rows);
}
/** RV40 8x8 mc33, avg variant: forwards to avg_pixels8_xy2_8_c over 8 rows. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
1334 #endif /* CONFIG_RV40_DECODER */
1335
1336 #if CONFIG_DIRAC_DECODER
/*
 * Generates the Dirac pixel wrappers for one operation prefix (put / avg).
 *
 * Every wrapper takes an array of five source pointers but only uses the
 * first one (plain), two (_l2) or four (_l4) entries, passing them on to the
 * matching 8-bit pixels helper with the same stride for all planes.
 * The 32-pixel wide variants are built from two 16-pixel wide calls, left
 * half first.
 */
#define DIRAC_MC(OPNAME)\
/* ---- one source ---- */\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    int off;\
    for (off = 0; off < 32; off += 16)\
        OPNAME ## _pixels16_8_c(dst + off, src[0] + off, stride, h);\
}\
/* ---- two sources ---- */\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    int off;\
    for (off = 0; off < 32; off += 16)\
        OPNAME ## _pixels16_l2_8(dst + off, src[0] + off, src[1] + off, stride, stride, stride, h);\
}\
/* ---- four sources ---- */\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    int off;\
    for (off = 0; off < 32; off += 16)\
        OPNAME ## _pixels16_l4_8(dst + off, src[0] + off, src[1] + off, src[2] + off, src[3] + off,\
                                 stride, stride, stride, stride, stride, h);\
}
DIRAC_MC(put)
DIRAC_MC(avg)
1379 #endif
1380
1381 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1382     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1383     int i;
1384
1385     for(i=0; i<w; i++){
1386         const int src_1= src[ -srcStride];
1387         const int src0 = src[0          ];
1388         const int src1 = src[  srcStride];
1389         const int src2 = src[2*srcStride];
1390         const int src3 = src[3*srcStride];
1391         const int src4 = src[4*srcStride];
1392         const int src5 = src[5*srcStride];
1393         const int src6 = src[6*srcStride];
1394         const int src7 = src[7*srcStride];
1395         const int src8 = src[8*srcStride];
1396         const int src9 = src[9*srcStride];
1397         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1398         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1399         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1400         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1401         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1402         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1403         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1404         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1405         src++;
1406         dst++;
1407     }
1408 }
1409
/**
 * mspel 8x8, position "10": horizontally filter the block into a temporary,
 * then combine the unshifted source with it via put_pixels8_l2_8.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1415
/** mspel 8x8, position "20": the horizontal filter output is written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1419
/**
 * mspel 8x8, position "30": like mc10, but the horizontally filtered block
 * is combined with the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1425
/** mspel 8x8, position "02": the vertical filter output is written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
1429
/**
 * mspel 8x8, position "12".
 *
 * The horizontal filter is run over 11 rows starting one row above the block
 * (src - stride) so that the second vertical pass, which starts at row 1 of
 * that temporary, has the rows above and below it needs.  The result is
 * combined with the purely vertically filtered source via put_pixels8_l2_8.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t horiz[8 * 11];
    uint8_t vert[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(horiz, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, horiz + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel 8x8, position "32".
 *
 * Same scheme as mc12, except that the purely vertical pass reads the source
 * shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t horiz[8 * 11];
    uint8_t vert[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(horiz, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, horiz + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel 8x8, position "22": horizontal filter over 11 rows (starting one row
 * above the block) into a temporary, then vertical filter from row 1 of that
 * temporary directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t horiz[8 * 11];

    wmv2_mspel8_h_lowpass(horiz, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, horiz + 8, stride, 8, 8);
}
1453
1454 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1455     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1456     int x;
1457     const int strength= ff_h263_loop_filter_strength[qscale];
1458
1459     for(x=0; x<8; x++){
1460         int d1, d2, ad1;
1461         int p0= src[x-2*stride];
1462         int p1= src[x-1*stride];
1463         int p2= src[x+0*stride];
1464         int p3= src[x+1*stride];
1465         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1466
1467         if     (d<-2*strength) d1= 0;
1468         else if(d<-  strength) d1=-2*strength - d;
1469         else if(d<   strength) d1= d;
1470         else if(d< 2*strength) d1= 2*strength - d;
1471         else                   d1= 0;
1472
1473         p1 += d1;
1474         p2 -= d1;
1475         if(p1&256) p1= ~(p1>>31);
1476         if(p2&256) p2= ~(p2>>31);
1477
1478         src[x-1*stride] = p1;
1479         src[x+0*stride] = p2;
1480
1481         ad1= FFABS(d1)>>1;
1482
1483         d2= av_clip((p0-p3)/4, -ad1, ad1);
1484
1485         src[x-2*stride] = p0 - d2;
1486         src[x+  stride] = p3 + d2;
1487     }
1488     }
1489 }
1490
1491 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1492     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1493     int y;
1494     const int strength= ff_h263_loop_filter_strength[qscale];
1495
1496     for(y=0; y<8; y++){
1497         int d1, d2, ad1;
1498         int p0= src[y*stride-2];
1499         int p1= src[y*stride-1];
1500         int p2= src[y*stride+0];
1501         int p3= src[y*stride+1];
1502         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1503
1504         if     (d<-2*strength) d1= 0;
1505         else if(d<-  strength) d1=-2*strength - d;
1506         else if(d<   strength) d1= d;
1507         else if(d< 2*strength) d1= 2*strength - d;
1508         else                   d1= 0;
1509
1510         p1 += d1;
1511         p2 -= d1;
1512         if(p1&256) p1= ~(p1>>31);
1513         if(p2&256) p2= ~(p2>>31);
1514
1515         src[y*stride-1] = p1;
1516         src[y*stride+0] = p2;
1517
1518         ad1= FFABS(d1)>>1;
1519
1520         d2= av_clip((p0-p3)/4, -ad1, ad1);
1521
1522         src[y*stride-2] = p0 - d2;
1523         src[y*stride+1] = p3 + d2;
1524     }
1525     }
1526 }
1527
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 (vertical) fills an intermediate buffer scaled by 4: the first and
 * last rows are copied as 4*pixel, the six inner rows get
 * above + 2*centre + below.  It reads the untouched source only.
 *
 * Pass 2 (horizontal) writes back: the first and last columns are just the
 * intermediate value rounded down by 2 bits, the six inner columns apply
 * left + 2*centre + right and round down by 4 bits.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int acc[8][8];
    int row, col;

    /* vertical pass */
    for (col = 0; col < 8; col++) {
        acc[0][col] = src[col] * 4;
        acc[7][col] = src[7 * stride + col] * 4;
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *mid = src + row * stride;

        for (col = 0; col < 8; col++)
            acc[row][col] = mid[col - stride] + 2 * mid[col] + mid[col + stride];
    }

    /* horizontal pass */
    for (row = 0; row < 8; row++) {
        uint8_t   *out = src + row * stride;
        const int *t   = acc[row];

        out[0] = (t[0] + 2) >> 2;
        out[7] = (t[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (t[col - 1] + 2 * t[col] + t[col + 1] + 8) >> 4;
    }
}
1554
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1582
/**
 * 16-wide SAD of pix1 against pix2 averaged with its right-hand neighbour.
 *
 * Each reference sample is avg2(pix2[x], pix2[x+1]), so 17 columns of pix2
 * are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1610
/**
 * 16-wide SAD of pix1 against pix2 averaged with the row below it.
 *
 * Each reference sample is avg2() of pix2 and the sample one line_size
 * further, so h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1640
/**
 * 16-wide SAD of pix1 against the avg4() of each 2x2 neighbourhood of pix2.
 *
 * The reference sample for column x combines pix2[x], pix2[x+1] and the same
 * two columns one row below; 17 columns and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1670
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1690
/**
 * 8-wide SAD of pix1 against pix2 averaged with its right-hand neighbour.
 *
 * Each reference sample is avg2(pix2[x], pix2[x+1]), so 9 columns of pix2
 * are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1710
/**
 * 8-wide SAD of pix1 against pix2 averaged with the row below it.
 *
 * Each reference sample is avg2() of pix2 and the sample one line_size
 * further, so h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1732
/**
 * 8-wide SAD of pix1 against the avg4() of each 2x2 neighbourhood of pix2.
 *
 * The reference sample for column x combines pix2[x], pix2[x+1] and the same
 * two columns one row below; 9 columns and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset between consecutive rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated SAD
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1754
1755 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1756     MpegEncContext *c = v;
1757     int score1=0;
1758     int score2=0;
1759     int x,y;
1760
1761     for(y=0; y<h; y++){
1762         for(x=0; x<16; x++){
1763             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1764         }
1765         if(y+1<h){
1766             for(x=0; x<15; x++){
1767                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1768                              - s1[x+1] + s1[x+1+stride])
1769                         -FFABS(  s2[x  ] - s2[x  +stride]
1770                              - s2[x+1] + s2[x+1+stride]);
1771             }
1772         }
1773         s1+= stride;
1774         s2+= stride;
1775     }
1776
1777     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1778     else  return score1 + FFABS(score2)*8;
1779 }
1780
1781 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1782     MpegEncContext *c = v;
1783     int score1=0;
1784     int score2=0;
1785     int x,y;
1786
1787     for(y=0; y<h; y++){
1788         for(x=0; x<8; x++){
1789             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1790         }
1791         if(y+1<h){
1792             for(x=0; x<7; x++){
1793                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1794                              - s1[x+1] + s1[x+1+stride])
1795                         -FFABS(  s2[x  ] - s2[x  +stride]
1796                              - s2[x+1] + s2[x+1+stride]);
1797             }
1798         }
1799         s1+= stride;
1800         s2+= stride;
1801     }
1802
1803     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1804     else  return score1 + FFABS(score2)*8;
1805 }
1806
1807 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1808     int i;
1809     unsigned int sum=0;
1810
1811     for(i=0; i<8*8; i++){
1812         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1813         int w= weight[i];
1814         b>>= RECON_SHIFT;
1815         av_assert2(-512<b && b<512);
1816
1817         sum += (w*b)*(w*b)>>4;
1818     }
1819     return sum>>2;
1820 }
1821
1822 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1823     int i;
1824
1825     for(i=0; i<8*8; i++){
1826         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1827     }
1828 }
1829
1830 /**
1831  * Permute an 8x8 block.
1832  * @param block the block which will be permuted according to the given permutation vector
1833  * @param permutation the permutation vector
1834  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1835  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1836  *                  (inverse) permutated to scantable order!
1837  */
1838 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1839 {
1840     int i;
1841     DCTELEM temp[64];
1842
1843     if(last<=0) return;
1844     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1845
1846     for(i=0; i<=last; i++){
1847         const int j= scantable[i];
1848         temp[j]= block[j];
1849         block[j]=0;
1850     }
1851
1852     for(i=0; i<=last; i++){
1853         const int j= scantable[i];
1854         const int perm_j= permutation[j];
1855         block[perm_j]= temp[j];
1856     }
1857 }
1858
/**
 * Comparison function that ignores its arguments and always scores 0.
 *
 * @return 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
1862
1863 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1864     int i;
1865
1866     memset(cmp, 0, sizeof(void*)*6);
1867
1868     for(i=0; i<6; i++){
1869         switch(type&0xFF){
1870         case FF_CMP_SAD:
1871             cmp[i]= c->sad[i];
1872             break;
1873         case FF_CMP_SATD:
1874             cmp[i]= c->hadamard8_diff[i];
1875             break;
1876         case FF_CMP_SSE:
1877             cmp[i]= c->sse[i];
1878             break;
1879         case FF_CMP_DCT:
1880             cmp[i]= c->dct_sad[i];
1881             break;
1882         case FF_CMP_DCT264:
1883             cmp[i]= c->dct264_sad[i];
1884             break;
1885         case FF_CMP_DCTMAX:
1886             cmp[i]= c->dct_max[i];
1887             break;
1888         case FF_CMP_PSNR:
1889             cmp[i]= c->quant_psnr[i];
1890             break;
1891         case FF_CMP_BIT:
1892             cmp[i]= c->bit[i];
1893             break;
1894         case FF_CMP_RD:
1895             cmp[i]= c->rd[i];
1896             break;
1897         case FF_CMP_VSAD:
1898             cmp[i]= c->vsad[i];
1899             break;
1900         case FF_CMP_VSSE:
1901             cmp[i]= c->vsse[i];
1902             break;
1903         case FF_CMP_ZERO:
1904             cmp[i]= zero_cmp;
1905             break;
1906         case FF_CMP_NSSE:
1907             cmp[i]= c->nsse[i];
1908             break;
1909 #if CONFIG_DWT
1910         case FF_CMP_W53:
1911             cmp[i]= c->w53[i];
1912             break;
1913         case FF_CMP_W97:
1914             cmp[i]= c->w97[i];
1915             break;
1916 #endif
1917         default:
1918             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1919         }
1920     }
1921 }
1922
1923 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1924     long i;
1925     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1926         long a = *(long*)(src+i);
1927         long b = *(long*)(dst+i);
1928         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1929     }
1930     for(; i<w; i++)
1931         dst[i+0] += src[i+0];
1932 }
1933
1934 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1935     long i;
1936 #if !HAVE_FAST_UNALIGNED
1937     if((long)src2 & (sizeof(long)-1)){
1938         for(i=0; i+7<w; i+=8){
1939             dst[i+0] = src1[i+0]-src2[i+0];
1940             dst[i+1] = src1[i+1]-src2[i+1];
1941             dst[i+2] = src1[i+2]-src2[i+2];
1942             dst[i+3] = src1[i+3]-src2[i+3];
1943             dst[i+4] = src1[i+4]-src2[i+4];
1944             dst[i+5] = src1[i+5]-src2[i+5];
1945             dst[i+6] = src1[i+6]-src2[i+6];
1946             dst[i+7] = src1[i+7]-src2[i+7];
1947         }
1948     }else
1949 #endif
1950     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1951         long a = *(long*)(src1+i);
1952         long b = *(long*)(src2+i);
1953         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1954     }
1955     for(; i<w; i++)
1956         dst[i+0] = src1[i+0]-src2[i+0];
1957 }
1958
/**
 * Reconstruct one row from median-prediction residuals.
 *
 * Each output byte is mid_pred(left, top, (left + top - top_left) & 0xFF)
 * plus the residual, truncated to 8 bits.
 *
 * @param dst      reconstructed row
 * @param src1     row used as the "top" neighbour
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: left neighbour of the first byte; out: last byte written
 * @param left_top in: top-left neighbour of the first byte; out: last top byte
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t top = src1[n];

        /* cur is uint8_t, so the sum wraps modulo 256 on assignment. */
        cur      = mid_pred(cur, top, (cur + top - top_left) & 0xFF) + diff[n];
        top_left = top;
        dst[n]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
1975
/**
 * Compute median-prediction residuals for one row.
 *
 * Each output byte is the source byte minus
 * mid_pred(left, top, (left + top - top_left) & 0xFF), truncated to 8 bits.
 *
 * @param dst      residuals
 * @param src1     row used as the "top" neighbour
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: left neighbour of the first byte; out: last src2 byte
 * @param left_top in: top-left neighbour of the first byte; out: last top byte
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t top  = src1[n];
        const int     pred = mid_pred(cur, top, (cur + top - top_left) & 0xFF);

        top_left = top;
        cur      = src2[n];
        dst[n]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}
1993
/**
 * Reconstruct a row from left-prediction residuals (running byte sum).
 *
 * @param dst receives the low 8 bits of the running sum after each byte
 * @param src residuals
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the final accumulator; it is not reduced to 8 bits
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int n = 0;

    while (n < w) {
        acc   += src[n];
        dst[n] = acc;
        n++;
    }

    return acc;
}
2012
/* Byte position of each channel inside a 4-byte pixel; the order is mirrored
 * on big-endian targets. The names are local to the function below. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Reconstruct a row of 4-byte pixels from per-channel left-prediction
 * residuals (one running sum per channel).
 *
 * @param dst   receives the low 8 bits of each channel's running sum
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial red accumulator;   out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator;  out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 *
 * The accumulators written back are not reduced to 8 bits.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* Read the whole pixel before writing so dst may alias src. */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2053
/* BUTTERFLY2: write the sum of i1 and i2 to o1 and their difference to o2.
 * Expands to two statements and evaluates each input twice. */
2054 #define BUTTERFLY2(o1,o2,i1,i2) \
2055 o1= (i1)+(i2);\
2056 o2= (i1)-(i2);
2057
/* BUTTERFLY1: in-place butterfly, x becomes x+y and y becomes x-y.
 * The expansion declares locals named a and b, so the arguments must not
 * refer to variables with those names. */
2058 #define BUTTERFLY1(x,y) \
2059 {\
2060     int a,b;\
2061     a= x;\
2062     b= y;\
2063     x= a+b;\
2064     y= a-b;\
2065 }
2066
/* BUTTERFLYA: |x+y| + |x-y|, i.e. the absolute values of both butterfly
 * outputs summed without storing them. */
2067 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2068
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard) transform of the
 * difference src - dst.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      must be 8 (checked with av_assert2)
 * @return the accumulated sum of absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int k;

    av_assert2(h==8);

    /* Horizontal pass: three butterfly stages over each row of the difference. */
    for (k = 0; k < 8; k++) {
        const uint8_t *cur = src + stride * k;
        const uint8_t *ref = dst + stride * k;
        int *row = temp + 8 * k;

        BUTTERFLY2(row[0], row[1], cur[0] - ref[0], cur[1] - ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2] - ref[2], cur[3] - ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4] - ref[4], cur[5] - ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6] - ref[6], cur[7] - ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Vertical pass: two stored stages, the third is folded into the
     * absolute-value accumulation by BUTTERFLYA. */
    for (k = 0; k < 8; k++) {
        int *col = temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
2113
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard) transform of a
 * single block, with the |temp[0] + temp[32]| term removed at the end
 * (marked "-mean" in the original).
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      must be 8 (checked with av_assert2)
 * @return the accumulated sum of absolute transform coefficients minus the
 *         term described above
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int k;

    av_assert2(h==8);

    /* Horizontal pass: three butterfly stages over each row. */
    for (k = 0; k < 8; k++) {
        const uint8_t *cur = src + stride * k;
        int *row = temp + 8 * k;

        BUTTERFLY2(row[0], row[1], cur[0], cur[1]);
        BUTTERFLY2(row[2], row[3], cur[2], cur[3]);
        BUTTERFLY2(row[4], row[5], cur[4], cur[5]);
        BUTTERFLY2(row[6], row[7], cur[6], cur[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Vertical pass: two stored stages, the third is folded into the
     * absolute-value accumulation by BUTTERFLYA. */
    for (k = 0; k < 8; k++) {
        int *col = temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2161
2162 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2163     MpegEncContext * const s= (MpegEncContext *)c;
2164     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2165
2166     av_assert2(h==8);
2167
2168     s->dsp.diff_pixels(temp, src1, src2, stride);
2169     s->dsp.fdct(temp);
2170     return s->dsp.sum_abs_dctelem(temp);
2171 }
2172
2173 #if CONFIG_GPL
/* DCT8_1D: one 8-point integer transform built only from additions,
 * subtractions and right shifts. It expands to a braced block that reads its
 * inputs through SRC(k) and emits output k through DST(k, value); both
 * accessors must be defined by the user before each expansion. All inputs are
 * read into locals before the first DST, so SRC and DST may address the same
 * storage. */
2174 #define DCT8_1D {\
2175     const int s07 = SRC(0) + SRC(7);\
2176     const int s16 = SRC(1) + SRC(6);\
2177     const int s25 = SRC(2) + SRC(5);\
2178     const int s34 = SRC(3) + SRC(4);\
2179     const int a0 = s07 + s34;\
2180     const int a1 = s16 + s25;\
2181     const int a2 = s07 - s34;\
2182     const int a3 = s16 - s25;\
2183     const int d07 = SRC(0) - SRC(7);\
2184     const int d16 = SRC(1) - SRC(6);\
2185     const int d25 = SRC(2) - SRC(5);\
2186     const int d34 = SRC(3) - SRC(4);\
2187     const int a4 = d16 + d25 + (d07 + (d07>>1));\
2188     const int a5 = d07 - d34 - (d25 + (d25>>1));\
2189     const int a6 = d07 + d34 - (d16 + (d16>>1));\
2190     const int a7 = d16 - d25 + (d34 + (d34>>1));\
2191     DST(0,  a0 + a1     ) ;\
2192     DST(1,  a4 + (a7>>2)) ;\
2193     DST(2,  a2 + (a3>>1)) ;\
2194     DST(3,  a5 + (a6>>2)) ;\
2195     DST(4,  a0 - a1     ) ;\
2196     DST(5,  a6 - (a5>>2)) ;\
2197     DST(6, (a2>>1) - a3 ) ;\
2198     DST(7, (a4>>2) - a7 ) ;\
2199 }
2200
/**
 * Sum of absolute coefficients of the separable DCT8_1D transform applied to
 * the difference of two 8x8 blocks.
 *
 * @param c      MpegEncContext; only its dsp.diff_pixels pointer is used
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive rows
 * @param h      unused
 * @return the accumulated sum of absolute second-pass outputs
 *
 * NOTE(review): the name suggests the H.264 8x8 integer transform -- confirm.
 */
2201 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2202     MpegEncContext * const s= (MpegEncContext *)c;
2203     DCTELEM dct[8][8];
2204     int i;
2205     int sum=0;
2206
2207     s->dsp.diff_pixels(dct[0], src1, src2, stride);
2208
/* First pass: transform each row dct[i][0..7] in place. */
2209 #define SRC(x) dct[i][x]
2210 #define DST(x,v) dct[i][x]= v
2211     for( i = 0; i < 8; i++ )
2212         DCT8_1D
2213 #undef SRC
2214 #undef DST
2215
/* Second pass: transform each column dct[0..7][i]; the outputs are not
 * stored, only their absolute values are accumulated into sum. */
2216 #define SRC(x) dct[x][i]
2217 #define DST(x,v) sum += FFABS(v)
2218     for( i = 0; i < 8; i++ )
2219         DCT8_1D
2220 #undef SRC
2221 #undef DST
2222     return sum;
2223 }
2224 #endif
2225
2226 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2227     MpegEncContext * const s= (MpegEncContext *)c;
2228     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2229     int sum=0, i;
2230
2231     av_assert2(h==8);
2232
2233     s->dsp.diff_pixels(temp, src1, src2, stride);
2234     s->dsp.fdct(temp);
2235
2236     for(i=0; i<64; i++)
2237         sum= FFMAX(sum, FFABS(temp[i]));
2238
2239     return sum;
2240 }
2241
2242 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2243     MpegEncContext * const s= (MpegEncContext *)c;
2244     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2245     DCTELEM * const bak = temp+64;
2246     int sum=0, i;
2247
2248     av_assert2(h==8);
2249     s->mb_intra=0;
2250
2251     s->dsp.diff_pixels(temp, src1, src2, stride);
2252
2253     memcpy(bak, temp, 64*sizeof(DCTELEM));
2254
2255     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2256     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2257     ff_simple_idct_8(temp); //FIXME
2258
2259     for(i=0; i<64; i++)
2260         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2261
2262     return sum;
2263 }
2264
2265 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2266     MpegEncContext * const s= (MpegEncContext *)c;
2267     const uint8_t *scantable= s->intra_scantable.permutated;
2268     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2269     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2270     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2271     int i, last, run, bits, level, distortion, start_i;
2272     const int esc_length= s->ac_esc_length;
2273     uint8_t * length;
2274     uint8_t * last_length;
2275
2276     av_assert2(h==8);
2277
2278     copy_block8(lsrc1, src1, 8, stride, 8);
2279     copy_block8(lsrc2, src2, 8, stride, 8);
2280
2281     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2282
2283     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2284
2285     bits=0;
2286
2287     if (s->mb_intra) {
2288         start_i = 1;
2289         length     = s->intra_ac_vlc_length;
2290         last_length= s->intra_ac_vlc_last_length;
2291         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2292     } else {
2293         start_i = 0;
2294         length     = s->inter_ac_vlc_length;
2295         last_length= s->inter_ac_vlc_last_length;
2296     }
2297
2298     if(last>=start_i){
2299         run=0;
2300         for(i=start_i; i<last; i++){
2301             int j= scantable[i];
2302             level= temp[j];
2303
2304             if(level){
2305                 level+=64;
2306                 if((level&(~127)) == 0){
2307                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2308                 }else
2309                     bits+= esc_length;
2310                 run=0;
2311             }else
2312                 run++;
2313         }
2314         i= scantable[last];
2315
2316         level= temp[i] + 64;
2317
2318         av_assert2(level - 64);
2319
2320         if((level&(~127)) == 0){
2321             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2322         }else
2323             bits+= esc_length;
2324
2325     }
2326
2327     if(last>=0){
2328         if(s->mb_intra)
2329             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2330         else
2331             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2332     }
2333
2334     s->dsp.idct_add(lsrc2, 8, temp);
2335
2336     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2337
2338     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2339 }
2340
2341 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2342     MpegEncContext * const s= (MpegEncContext *)c;
2343     const uint8_t *scantable= s->intra_scantable.permutated;
2344     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2345     int i, last, run, bits, level, start_i;
2346     const int esc_length= s->ac_esc_length;
2347     uint8_t * length;
2348     uint8_t * last_length;
2349
2350     av_assert2(h==8);
2351
2352     s->dsp.diff_pixels(temp, src1, src2, stride);
2353
2354     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2355
2356     bits=0;
2357
2358     if (s->mb_intra) {
2359         start_i = 1;
2360         length     = s->intra_ac_vlc_length;
2361         last_length= s->intra_ac_vlc_last_length;
2362         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2363     } else {
2364         start_i = 0;
2365         length     = s->inter_ac_vlc_length;
2366         last_length= s->inter_ac_vlc_last_length;
2367     }
2368
2369     if(last>=start_i){
2370         run=0;
2371         for(i=start_i; i<last; i++){
2372             int j= scantable[i];
2373             level= temp[j];
2374
2375             if(level){
2376                 level+=64;
2377                 if((level&(~127)) == 0){
2378                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2379                 }else
2380                     bits+= esc_length;
2381                 run=0;
2382             }else
2383                 run++;
2384         }
2385         i= scantable[last];
2386
2387         level= temp[i] + 64;
2388
2389         av_assert2(level - 64);
2390
2391         if((level&(~127)) == 0){
2392             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2393         }else
2394             bits+= esc_length;
2395     }
2396
2397     return bits;
2398 }
2399
/*
 * VSAD_INTRA(size) defines vsad_intra<size>_c(): the sum of absolute
 * differences between each pixel and the pixel directly below it, over a
 * block `size` pixels wide and h rows tall (h-1 row pairs). The c and dummy
 * arguments are unused. Instantiated below for widths 8 and 16.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                      \
    int row, col;                                                       \
                                                                        \
    for (row = 1; row < h; row++) {                                     \
        const uint8_t *below = s + stride;                              \
                                                                        \
        for (col = 0; col < size; col++)                                \
            total += FFABS(s[col] - below[col]);                        \
        s += stride;                                                    \
    }                                                                   \
                                                                        \
    return total;                                                       \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2417
/**
 * Vertical SAD of the difference signal s1 - s2 for a 16-pixel-wide block:
 * sums |(s1 - s2) at a row  -  (s1 - s2) one row below| over h-1 row pairs.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows, applied to both blocks
 * @param h      number of rows
 * @return the accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - below1[col] + below2[col]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2432
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * VSSE_INTRA(size) defines vsse_intra<size>_c(): the sum of squared
 * differences between each pixel and the pixel directly below it, over a
 * block `size` pixels wide and h rows tall (h-1 row pairs). The c and dummy
 * arguments are unused. Instantiated below for widths 8 and 16.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                      \
    int row, col;                                                       \
                                                                        \
    for (row = 1; row < h; row++) {                                     \
        const uint8_t *below = s + stride;                              \
                                                                        \
        for (col = 0; col < size; col++) {                              \
            const int step = s[col] - below[col];                       \
            total += SQ(step);                                          \
        }                                                               \
        s += stride;                                                    \
    }                                                                   \
                                                                        \
    return total;                                                       \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2451
/**
 * Vertical SSE of the difference signal (s1 - s2) over a 16-pixel-wide area.
 *
 * For every pair of vertically adjacent rows (h - 1 pairs) and each of the
 * 16 columns, accumulates the square of
 *   (s1[x] - s2[x]) - (s1[x + stride] - s2[x + stride]).
 *
 * @param c      unused context pointer
 * @param s1     first picture, top-left pixel of the area
 * @param s2     second picture, top-left pixel of the area
 * @param stride distance in bytes between two rows (same for both pictures)
 * @param h      number of rows; nothing is accumulated when h <= 1
 * @return the accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        uint8_t *s1_below = s1 + stride;
        uint8_t *s2_below = s2 + stride;

        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - s1_below[col] + s2_below[col]);
    }

    return total;
}
2466
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare; 0 is returned when size <= 0
 * @return sum over all elements of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int k   = 0;

    while (k < size) {
        const int a = pix1[k];
        const int b = pix2[k];

        sum += (a - b) * (a - b);
        k++;
    }
    return sum;
}
2475
/* Instantiations of WRAPPER8_16_SQ (defined outside this section). Each line
 * pairs an 8x8 comparison function (first argument) with the name of a
 * 16-wide function (second argument); the 16-wide names are the ones
 * installed into DSPContext by SET_CMP_FUNC in ff_dsputil_init().
 * NOTE(review): presumably the macro builds the 16-wide function out of calls
 * to the 8x8 one -- confirm against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2486
/**
 * Multiply src0 element-wise with src1 read back to front:
 *   dst[i] = src0[i] * src1[len - 1 - i]   for i in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int mirrored = len - 1 - k;

        dst[k] = src0[k] * src1[mirrored];
    }
}
2493
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i]
 * for i in [0, len), processed in ascending order.
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2499
/**
 * Windowed combination of two len-sample inputs into 2*len outputs.
 *
 * With f running upwards from 0 and r running downwards from len - 1:
 *   dst[f]       = src0[f] * win[len + r] - src1[r] * win[f]
 *   dst[len + r] = src0[f] * win[f]       + src1[r] * win[len + r]
 *
 * @param dst  output, 2 * len elements
 * @param src0 first input, len elements (read front to back)
 * @param src1 second input, len elements (read back to front)
 * @param win  window, 2 * len elements
 * @param len  number of elements in each input
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    float       *dst_hi = dst + len;
    const float *win_hi = win + len;
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--) {
        const float a  = src0[fwd];
        const float b  = src1[rev];
        const float wf = win[fwd];
        const float wr = win_hi[rev];

        dst[fwd]    = a*wr - b*wf;
        dst_hi[rev] = a*wf + b*wr;
    }
}
2516
2517 static void butterflies_float_c(float *av_restrict v1, float *av_restrict v2,
2518                                 int len)
2519 {
2520     int i;
2521     for (i = 0; i < len; i++) {
2522         float t = v1[i] - v2[i];
2523         v1[i] += v2[i];
2524         v2[i] = t;
2525     }
2526 }
2527
/**
 * Butterfly of two float vectors with interleaved output:
 *   dst[2*i]     = src0[i] + src1[i]
 *   dst[2*i + 1] = src0[i] - src1[i]
 * for i in [0, len). dst therefore receives 2 * len elements.
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    float *out = dst;
    int k;

    for (k = 0; k < len; k++, out += 2) {
        const float a = src0[k];
        const float b = src1[k];

        out[0] = a + b;
        out[1] = a - b;
    }
}
2539
/**
 * Dot product of two float vectors, accumulated in single precision in
 * ascending index order.
 *
 * @param v1  first vector, len elements
 * @param v2  second vector, len elements
 * @param len number of elements; 0.0 is returned when len <= 0
 * @return sum of v1[i] * v2[i]
 */
float ff_scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2550
/**
 * Clip one float, passed as its 32-bit pattern, using unsigned integer
 * comparisons only. Valid when the lower bound is negative and the upper
 * bound is positive (the only case vector_clipf_c() routes here).
 *
 * - mini has its sign bit set, so any pattern that compares greater than it
 *   as an unsigned integer also has the sign bit set and a larger magnitude
 *   field, i.e. lies below the minimum.
 * - XOR-ing with the sign bit makes positive patterns compare above negative
 *   ones, so a result greater than maxisign means a positive value above the
 *   maximum.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) minimum
 * @param maxi     bit pattern of the (positive) maximum
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip len floats from src into dst for the case *min < 0 < *max.
 *
 * Elements are handled in groups of 8; every group is processed in full, so
 * len is effectively rounded up to a multiple of 8 and both arrays must be
 * that large. dst may be the same array as src.
 *
 * The float <-> uint32_t reinterpretation is done with memcpy() instead of
 * casting float pointers to uint32_t pointers, which would violate the C
 * effective-type (strict aliasing) rules.
 *
 * @param dst destination array
 * @param src source array
 * @param min pointer to the lower bound (must be negative)
 * @param max pointer to the upper bound (must be positive)
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini, maxi, maxisign;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        uint32_t bits[8];

        /* Pull a whole group in before writing so dst == src keeps working. */
        memcpy(bits, src + i, sizeof(bits));
        for(k=0; k<8; k++)
            bits[k] = clipf_c_one(bits[k], mini, maxi, maxisign);
        memcpy(dst + i, bits, sizeof(bits));
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When min is negative and max is positive the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are handled in groups of 8 and every started group is
 * processed in full, so both arrays must hold len rounded up to a multiple
 * of 8 elements.
 *
 * @param dst destination array
 * @param src source array
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, lane;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (lane = 0; lane < 8; lane++)
            dst[base + lane] = av_clipf(src[base + lane], min, max);
    }
}
2595
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector, order elements
 * @param v2    second vector, order elements
 * @param order number of elements (counted down to zero)
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++)
        acc += v1[k] * v2[k];

    return acc;
}
2605
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[i] += mul * v3[i]. Each product uses the value of v1[i] from before
 * its update.
 *
 * @param v1    first vector, read and then modified, order elements
 * @param v2    second vector, order elements
 * @param v3    vector scaled by mul and added into v1, order elements
 * @param order number of elements (counted down to zero)
 * @param mul   multiplier applied to v3
 * @return sum of (original v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2615
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len / 2 window coefficients are read; coefficient i is
 * applied to both input[i] and input[len - 1 - i]. Each product is rounded
 * by adding 1 << 14 and shifted right by 15. When len is odd the middle
 * output element is not written.
 *
 * @param output destination, len elements
 * @param input  source, len elements
 * @param window window coefficients, at least len / 2 elements
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int k;

    for (k = 0; k < half; k++) {
        const unsigned int mirror = len - k - 1;
        const int16_t coef        = window[k];

        output[k]      = (MUL16(input[k],      coef) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], coef) + (1 << 14)) >> 15;
    }
}
2628
/**
 * Clip every element of src to [min, max] with av_clip() and store it in dst.
 *
 * Eight elements are handled per pass. The pass runs before len is tested and
 * len is an unsigned counter decremented by 8, so the caller must pass a
 * non-zero multiple of 8.
 *
 * @param dst destination array
 * @param src source array
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int lane;

        for (lane = 0; lane < 8; lane++)
            dst[lane] = av_clip(src[lane], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2644
2645 #define W0 2048
2646 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2647 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2648 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2649 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2650 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2651 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2652 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2653
/**
 * Row pass of the WMV2 inverse DCT: transforms the 8 consecutive
 * coefficients b[0..7] in place.
 *
 * The W0..W7 multipliers are fixed-point constants scaled by 2048 (see the
 * comments on their definitions). Every output is rounded by adding 1 << 7
 * and shifted right by 8.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    /* odd coefficients, paired as (1,7) and (5,3) */
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    /* even coefficients, paired as (2,6) and (0,4) */
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    /* 181/256 is approximately 1/sqrt(2); +128 rounds before the >>8 */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    /* outputs k and 7-k share the same even part and differ only in the
     * sign of the odd part */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/**
 * Column pass of the WMV2 inverse DCT: transforms the 8 coefficients
 * b[8*0], b[8*1], ..., b[8*7] (one column of a block stored with a row
 * pitch of 8) in place.
 *
 * Same butterfly structure as the row pass, but the step-1 products are
 * shifted right by 3 first and the final outputs are rounded by adding
 * 1 << 13 and shifted right by 14.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    /* +4 rounds before the >>3; the a0/a4 lines carry no rounding term */
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    /* 181/256 is approximately 1/sqrt(2); +128 rounds before the >>8 */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    /* outputs k and 7-k share the same even part and differ only in the
     * sign of the odd part */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/**
 * In-place 8x8 WMV2 inverse DCT on a block of 64 coefficients: the row pass
 * is applied to each of the 8 rows first, then the column pass to each of
 * the 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2717 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2718  converted */
/* Run the WMV2 inverse DCT on block in place, then hand the result to
 * put_pixels_clamped_c() to be written to dest with row pitch line_size. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the WMV2 inverse DCT on block in place, then hand the result to
 * add_pixels_clamped_c() to be combined with dest (row pitch line_size). */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* Run ff_j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() to be written to dest with row pitch line_size. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run ff_j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() to be combined with dest (row pitch line_size). */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2739
/* Reduced-size variant selected for lowres == 1: run ff_j_rev_dct4() on
 * block in place, then pass the result to put_pixels_clamped4_c(). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* Reduced-size variant selected for lowres == 1: run ff_j_rev_dct4() on
 * block in place, then pass the result to add_pixels_clamped4_c(). */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2750
/* Reduced-size variant selected for lowres == 2: run ff_j_rev_dct2() on
 * block in place, then pass the result to put_pixels_clamped2_c(). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* Reduced-size variant selected for lowres == 2: run ff_j_rev_dct2() on
 * block in place, then pass the result to add_pixels_clamped2_c(). */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2761
/* Single-pixel variant selected for lowres == 3: only block[0] is used.
 * It is rounded ((block[0] + 4) >> 3), clipped to 0..255 and stored in
 * dest[0]. line_size is unused. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    dest[0] = av_clip_uint8((block[0] + 4)>>3);
}
/* Single-pixel variant selected for lowres == 3: only block[0] is used.
 * The rounded value (block[0] + 4) >> 3 is added to dest[0] and the sum is
 * clipped to 0..255. line_size is unused. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
}
2770
/* No-op taking (mem, stride, h); ff_dsputil_init() installs it as the
 * default c->prefetch. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2772
2773 /* init static data */
2774 av_cold void ff_dsputil_static_init(void)
2775 {
2776     int i;
2777
2778     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2779     for(i=0;i<MAX_NEG_CROP;i++) {
2780         ff_cropTbl[i] = 0;
2781         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2782     }
2783
2784     for(i=0;i<512;i++) {
2785         ff_squareTbl[i] = (i - 256) * (i - 256);
2786     }
2787
2788     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2789 }
2790
2791 int ff_check_alignment(void){
2792     static int did_fail=0;
2793     LOCAL_ALIGNED_16(int, aligned, [4]);
2794
2795     if((intptr_t)aligned & 15){
2796         if(!did_fail){
2797 #if HAVE_MMX || HAVE_ALTIVEC
2798             av_log(NULL, AV_LOG_ERROR,
2799                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2800                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2801                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2802                 "Do not report crashes to FFmpeg developers.\n");
2803 #endif
2804             did_fail=1;
2805         }
2806         return -1;
2807     }
2808     return 0;
2809 }
2810
/**
 * Fill a DSPContext with the C implementations of the DSP routines.
 *
 * Order of operations:
 *  1. pick forward DCT (encoders only) and inverse DCT from avctx settings;
 *  2. install the generic C function pointers;
 *  3. install the bit-depth dependent pointers via BIT_DEPTH_FUNCS;
 *  4. call the architecture-specific init functions that are enabled at
 *     build time (they run after the C defaults are in place);
 *  5. fill still-empty 2-tap qpel entries from the H.264 qpel tables;
 *  6. build c->idct_permutation from the chosen permutation type.
 *
 * NOTE(review): c->dct_bits is read in step 3 but never assigned in this
 * function, so it has to be set by the caller beforehand.
 *
 * @param c     context to fill
 * @param avctx codec context; bits_per_raw_sample, dct_algo, idct_algo,
 *              lowres and codec_type are read here
 */
av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i, j;

    /* result is ignored; the call only serves to emit the warning */
    ff_check_alignment();

#if CONFIG_ENCODERS
    /* forward DCT: 10-bit input always uses the islow_10 pair, otherwise
     * dct_algo decides */
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = ff_fdct_ifast;
            c->fdct248 = ff_fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT: a non-zero lowres (1..3) takes priority over both the
     * bit depth and idct_algo */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = ff_j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = ff_j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = ff_j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if (avctx->bits_per_raw_sample == 10) {
            c->idct_put              = ff_simple_idct_put_10;
            c->idct_add              = ff_simple_idct_add_10;
            c->idct                  = ff_simple_idct_10;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        } else {
        /* only FF_IDCT_INT uses a coefficient permutation
         * (FF_LIBMPEG2_IDCT_PERM); every other choice here uses none */
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = ff_j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put = ff_simple_idct_put_8;
            c->idct_add = ff_simple_idct_add_8;
            c->idct     = ff_simple_idct_8;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
        }
    }

    /* generic pixel helpers */
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* tpel tables: the index is x + 4*y with x, y in 0..2, so entries
     * 3 and 7 are not assigned */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc fills all 16 entries (x + 4*y, x and y in 0..3) of one
     * PFX_pixels_tab[IDX] row with the PFX<NUM>_mcXY_c functions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: slot 0 gets the 16-wide function, slot 1 the
     * 8x8 one; where present, slots 4 and 5 hold the intra variants */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    /* vsad/vsse have no slot-1 (8-wide, two-picture) entry set here */
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
#endif
    /* float / int16 / int32 vector helpers */
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = ff_scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->butterflies_float_interleave = butterflies_float_interleave_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    /* default prefetch is a no-op */
    c->prefetch= just_return;

    /* cleared so that the fallback loop at the end of this function can tell
     * which 2-tap entries nobody has filled in */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* FUNC / FUNCC paste the bit depth (and a _c suffix) onto a function
     * name to select a bit-depth specific instance */
#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

    /* dspfunc1: the four (full, x2, y2, xy2) entries of PFX_pixels_tab[IDX] */
#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

    /* dspfunc2: all 16 mcXY entries of PFX_pixels_tab[IDX] */
#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


    /* BIT_DEPTH_FUNCS installs everything that depends on the pixel bit
     * depth; the dct argument (_16 or _32) additionally selects the
     * coefficient-width variant of get_pixels, clear_block(s) and
     * add_pixels4/8. avg_h264_qpel is only filled for indices 0..2. */
#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

    /* 9/10/12/14-bit content gets its own instances, with c->dct_bits == 32
     * choosing the _32 variants */
    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    case 12:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(12, _32);
        } else {
            BIT_DEPTH_FUNCS(12, _16);
        }
        break;
    case 14:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(14, _32);
        } else {
            BIT_DEPTH_FUNCS(14, _16);
        }
        break;
    default:
        /* 8-bit functions are used for depths <= 8 and for every non-video
         * codec. NOTE(review): a video codec with any other depth (e.g. 11
         * or 16) leaves all BIT_DEPTH_FUNCS pointers unassigned here. */
        if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
            BIT_DEPTH_FUNCS(8, _16);
        }
        break;
    }


    /* architecture-specific init; runs after the C defaults so it can
     * replace individual pointers (presumably with optimised versions) */
    if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
    if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
    if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
    if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);

    /* any 2-tap qpel entry still NULL after the memset above falls back to
     * the matching H.264 qpel function.
     * NOTE(review): for i == 3 the avg source entries are not assigned by
     * BIT_DEPTH_FUNCS, so whatever the table already held is copied. */
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 16; j++) {
            if(!c->put_2tap_qpel_pixels_tab[i][j])
                c->put_2tap_qpel_pixels_tab[i][j] =
                    c->put_h264_qpel_pixels_tab[i][j];
            if(!c->avg_2tap_qpel_pixels_tab[i][j])
                c->avg_2tap_qpel_pixels_tab[i][j] =
                    c->avg_h264_qpel_pixels_tab[i][j];
        }
    }

    /* done last so it sees the final idct_permutation_type */
    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}
3174
/* Un-prefixed entry point that simply forwards to ff_dsputil_init(). */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    ff_dsputil_init(c, avctx);
}