git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41
/*
 * Clamping lookup table. Code in this file addresses it as
 * ff_cropTbl + MAX_NEG_CROP so that indices may run MAX_NEG_CROP entries
 * below 0 and above 255. Only zero-initialised here.
 * NOTE(review): the real contents are presumably filled in by an init
 * routine outside this part of the file -- confirm.
 */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/*
 * Lookup table used by the pix_norm1/sse routines as ff_squareTbl + 256,
 * i.e. indexed by values in the range -256..255. Only zero-initialised here.
 * NOTE(review): presumably holds squares of the index; filled in elsewhere.
 */
uint32_t ff_squareTbl[512] = {0, };
  44
/*
 * Instantiate dsputil_template.c once per bit depth (9, 10, then 8).
 * BIT_DEPTH is undefined again after the 9- and 10-bit passes; no #undef
 * follows the 8-bit pass here, so BIT_DEPTH stays defined as 8 for the code
 * that follows.
 */
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
  55
  56 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  57 #define pb_7f (~0UL/255 * 0x7f)
  58 #define pb_80 (~0UL/255 * 0x80)
  59
/*
 * 8x8 zigzag scan order: entry i is the position (0..63) inside the block of
 * the i-th coefficient visited. The table is a permutation of 0..63.
 */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  70
/* Specific zigzag scan for the 248 IDCT. NOTE that, unlike the
   specification, the two fields are interleaved here. Like
   ff_zigzag_direct, the table is a permutation of 0..63. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  83
/* Inverse of ff_zigzag_direct, not permuted, with every entry offset by +1;
 * intended for the MMX quantizer. Only storage (16-byte aligned) is defined
 * here.
 * NOTE(review): the table is presumably populated by an init routine that is
 * not visible in this part of the file -- confirm. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
  86
/*
 * Alternate 8x8 scan order that walks short horizontal runs first
 * (0, 1, 2, 3, then 8, 9, ...). The table is a permutation of 0..63 with the
 * same layout as ff_zigzag_direct.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  97
/*
 * Alternate 8x8 scan order that walks short vertical runs first
 * (0, 8, 16, 24, then 1, 9, ...). The table is a permutation of 0..63 with the
 * same layout as ff_zigzag_direct.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 108
/* Input permutation for the simple_idct_mmx.
 * ff_init_scantable_permutation() copies this table verbatim into the
 * caller's idct_permutation[] when FF_SIMPLE_IDCT_PERM is requested. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 120
/* Per-row column reordering for FF_SSE2_IDCT_PERM: the low three bits of an
 * index are replaced by idct_sse2_row_perm[index & 7] while the row bits
 * (index & 0x38) are kept. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 122
 123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 124     int i;
 125     int end;
 126
 127     st->scantable= src_scantable;
 128
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = src_scantable[i];
 132         st->permutated[i] = permutation[j];
 133 #if ARCH_PPC
 134         st->inverse[j] = i;
 135 #endif
 136     }
 137
 138     end=-1;
 139     for(i=0; i<64; i++){
 140         int j;
 141         j = st->permutated[i];
 142         if(j>end) end=j;
 143         st->raster_end[i]= end;
 144     }
 145 }
 146
 147 void ff_init_scantable_permutation(uint8_t *idct_permutation,
 148                                    int idct_permutation_type)
 149 {
 150     int i;
 151
 152     switch(idct_permutation_type){
 153     case FF_NO_IDCT_PERM:
 154         for(i=0; i<64; i++)
 155             idct_permutation[i]= i;
 156         break;
 157     case FF_LIBMPEG2_IDCT_PERM:
 158         for(i=0; i<64; i++)
 159             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 160         break;
 161     case FF_SIMPLE_IDCT_PERM:
 162         for(i=0; i<64; i++)
 163             idct_permutation[i]= simple_mmx_permutation[i];
 164         break;
 165     case FF_TRANSPOSE_IDCT_PERM:
 166         for(i=0; i<64; i++)
 167             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 168         break;
 169     case FF_PARTTRANS_IDCT_PERM:
 170         for(i=0; i<64; i++)
 171             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 172         break;
 173     case FF_SSE2_IDCT_PERM:
 174         for(i=0; i<64; i++)
 175             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 176         break;
 177     default:
 178         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 179     }
 180 }
 181
 182 static int pix_sum_c(uint8_t * pix, int line_size)
 183 {
 184     int s, i, j;
 185
 186     s = 0;
 187     for (i = 0; i < 16; i++) {
 188         for (j = 0; j < 16; j += 8) {
 189             s += pix[0];
 190             s += pix[1];
 191             s += pix[2];
 192             s += pix[3];
 193             s += pix[4];
 194             s += pix[5];
 195             s += pix[6];
 196             s += pix[7];
 197             pix += 8;
 198         }
 199         pix += line_size - 16;
 200     }
 201     return s;
 202 }
 203
 204 static int pix_norm1_c(uint8_t * pix, int line_size)
 205 {
 206     int s, i, j;
 207     uint32_t *sq = ff_squareTbl + 256;
 208
 209     s = 0;
 210     for (i = 0; i < 16; i++) {
 211         for (j = 0; j < 16; j += 8) {
 212 #if 0
 213             s += sq[pix[0]];
 214             s += sq[pix[1]];
 215             s += sq[pix[2]];
 216             s += sq[pix[3]];
 217             s += sq[pix[4]];
 218             s += sq[pix[5]];
 219             s += sq[pix[6]];
 220             s += sq[pix[7]];
 221 #else
 222 #if HAVE_FAST_64BIT
 223             register uint64_t x=*(uint64_t*)pix;
 224             s += sq[x&0xff];
 225             s += sq[(x>>8)&0xff];
 226             s += sq[(x>>16)&0xff];
 227             s += sq[(x>>24)&0xff];
 228             s += sq[(x>>32)&0xff];
 229             s += sq[(x>>40)&0xff];
 230             s += sq[(x>>48)&0xff];
 231             s += sq[(x>>56)&0xff];
 232 #else
 233             register uint32_t x=*(uint32_t*)pix;
 234             s += sq[x&0xff];
 235             s += sq[(x>>8)&0xff];
 236             s += sq[(x>>16)&0xff];
 237             s += sq[(x>>24)&0xff];
 238             x=*(uint32_t*)(pix+4);
 239             s += sq[x&0xff];
 240             s += sq[(x>>8)&0xff];
 241             s += sq[(x>>16)&0xff];
 242             s += sq[(x>>24)&0xff];
 243 #endif
 244 #endif
 245             pix += 8;
 246         }
 247         pix += line_size - 16;
 248     }
 249     return s;
 250 }
 251
 252 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 253     int i;
 254
 255     for(i=0; i+8<=w; i+=8){
 256         dst[i+0]= av_bswap32(src[i+0]);
 257         dst[i+1]= av_bswap32(src[i+1]);
 258         dst[i+2]= av_bswap32(src[i+2]);
 259         dst[i+3]= av_bswap32(src[i+3]);
 260         dst[i+4]= av_bswap32(src[i+4]);
 261         dst[i+5]= av_bswap32(src[i+5]);
 262         dst[i+6]= av_bswap32(src[i+6]);
 263         dst[i+7]= av_bswap32(src[i+7]);
 264     }
 265     for(;i<w; i++){
 266         dst[i+0]= av_bswap32(src[i+0]);
 267     }
 268 }
 269
 270 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 271 {
 272     while (len--)
 273         *dst++ = av_bswap16(*src++);
 274 }
 275
 276 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 277 {
 278     int s, i;
 279     uint32_t *sq = ff_squareTbl + 256;
 280
 281     s = 0;
 282     for (i = 0; i < h; i++) {
 283         s += sq[pix1[0] - pix2[0]];
 284         s += sq[pix1[1] - pix2[1]];
 285         s += sq[pix1[2] - pix2[2]];
 286         s += sq[pix1[3] - pix2[3]];
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 294 {
 295     int s, i;
 296     uint32_t *sq = ff_squareTbl + 256;
 297
 298     s = 0;
 299     for (i = 0; i < h; i++) {
 300         s += sq[pix1[0] - pix2[0]];
 301         s += sq[pix1[1] - pix2[1]];
 302         s += sq[pix1[2] - pix2[2]];
 303         s += sq[pix1[3] - pix2[3]];
 304         s += sq[pix1[4] - pix2[4]];
 305         s += sq[pix1[5] - pix2[5]];
 306         s += sq[pix1[6] - pix2[6]];
 307         s += sq[pix1[7] - pix2[7]];
 308         pix1 += line_size;
 309         pix2 += line_size;
 310     }
 311     return s;
 312 }
 313
 314 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 315 {
 316     int s, i;
 317     uint32_t *sq = ff_squareTbl + 256;
 318
 319     s = 0;
 320     for (i = 0; i < h; i++) {
 321         s += sq[pix1[ 0] - pix2[ 0]];
 322         s += sq[pix1[ 1] - pix2[ 1]];
 323         s += sq[pix1[ 2] - pix2[ 2]];
 324         s += sq[pix1[ 3] - pix2[ 3]];
 325         s += sq[pix1[ 4] - pix2[ 4]];
 326         s += sq[pix1[ 5] - pix2[ 5]];
 327         s += sq[pix1[ 6] - pix2[ 6]];
 328         s += sq[pix1[ 7] - pix2[ 7]];
 329         s += sq[pix1[ 8] - pix2[ 8]];
 330         s += sq[pix1[ 9] - pix2[ 9]];
 331         s += sq[pix1[10] - pix2[10]];
 332         s += sq[pix1[11] - pix2[11]];
 333         s += sq[pix1[12] - pix2[12]];
 334         s += sq[pix1[13] - pix2[13]];
 335         s += sq[pix1[14] - pix2[14]];
 336         s += sq[pix1[15] - pix2[15]];
 337
 338         pix1 += line_size;
 339         pix2 += line_size;
 340     }
 341     return s;
 342 }
 343
 344 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 345                           const uint8_t *s2, int stride){
 346     int i;
 347
 348     /* read the pixels */
 349     for(i=0;i<8;i++) {
 350         block[0] = s1[0] - s2[0];
 351         block[1] = s1[1] - s2[1];
 352         block[2] = s1[2] - s2[2];
 353         block[3] = s1[3] - s2[3];
 354         block[4] = s1[4] - s2[4];
 355         block[5] = s1[5] - s2[5];
 356         block[6] = s1[6] - s2[6];
 357         block[7] = s1[7] - s2[7];
 358         s1 += stride;
 359         s2 += stride;
 360         block += 8;
 361     }
 362 }
 363
 364
 365 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 366                              int line_size)
 367 {
 368     int i;
 369     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 370
 371     /* read the pixels */
 372     for(i=0;i<8;i++) {
 373         pixels[0] = cm[block[0]];
 374         pixels[1] = cm[block[1]];
 375         pixels[2] = cm[block[2]];
 376         pixels[3] = cm[block[3]];
 377         pixels[4] = cm[block[4]];
 378         pixels[5] = cm[block[5]];
 379         pixels[6] = cm[block[6]];
 380         pixels[7] = cm[block[7]];
 381
 382         pixels += line_size;
 383         block += 8;
 384     }
 385 }
 386
 387 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 388                                  int line_size)
 389 {
 390     int i;
 391     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 392
 393     /* read the pixels */
 394     for(i=0;i<4;i++) {
 395         pixels[0] = cm[block[0]];
 396         pixels[1] = cm[block[1]];
 397         pixels[2] = cm[block[2]];
 398         pixels[3] = cm[block[3]];
 399
 400         pixels += line_size;
 401         block += 8;
 402     }
 403 }
 404
 405 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 406                                  int line_size)
 407 {
 408     int i;
 409     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 410
 411     /* read the pixels */
 412     for(i=0;i<2;i++) {
 413         pixels[0] = cm[block[0]];
 414         pixels[1] = cm[block[1]];
 415
 416         pixels += line_size;
 417         block += 8;
 418     }
 419 }
 420
 421 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 422                                     uint8_t *restrict pixels,
 423                                     int line_size)
 424 {
 425     int i, j;
 426
 427     for (i = 0; i < 8; i++) {
 428         for (j = 0; j < 8; j++) {
 429             if (*block < -128)
 430                 *pixels = 0;
 431             else if (*block > 127)
 432                 *pixels = 255;
 433             else
 434                 *pixels = (uint8_t)(*block + 128);
 435             block++;
 436             pixels++;
 437         }
 438         pixels += (line_size - 8);
 439     }
 440 }
 441
 442 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 443                              int line_size)
 444 {
 445     int i;
 446     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 447
 448     /* read the pixels */
 449     for(i=0;i<8;i++) {
 450         pixels[0] = cm[pixels[0] + block[0]];
 451         pixels[1] = cm[pixels[1] + block[1]];
 452         pixels[2] = cm[pixels[2] + block[2]];
 453         pixels[3] = cm[pixels[3] + block[3]];
 454         pixels[4] = cm[pixels[4] + block[4]];
 455         pixels[5] = cm[pixels[5] + block[5]];
 456         pixels[6] = cm[pixels[6] + block[6]];
 457         pixels[7] = cm[pixels[7] + block[7]];
 458         pixels += line_size;
 459         block += 8;
 460     }
 461 }
 462
 463 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 464                           int line_size)
 465 {
 466     int i;
 467     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 468
 469     /* read the pixels */
 470     for(i=0;i<4;i++) {
 471         pixels[0] = cm[pixels[0] + block[0]];
 472         pixels[1] = cm[pixels[1] + block[1]];
 473         pixels[2] = cm[pixels[2] + block[2]];
 474         pixels[3] = cm[pixels[3] + block[3]];
 475         pixels += line_size;
 476         block += 8;
 477     }
 478 }
 479
 480 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 481                           int line_size)
 482 {
 483     int i;
 484     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 485
 486     /* read the pixels */
 487     for(i=0;i<2;i++) {
 488         pixels[0] = cm[pixels[0] + block[0]];
 489         pixels[1] = cm[pixels[1] + block[1]];
 490         pixels += line_size;
 491         block += 8;
 492     }
 493 }
 494
 495 static int sum_abs_dctelem_c(DCTELEM *block)
 496 {
 497     int sum=0, i;
 498     for(i=0; i<64; i++)
 499         sum+= FFABS(block[i]);
 500     return sum;
 501 }
 502
 503 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 504 {
 505     int i;
 506
 507     for (i = 0; i < h; i++) {
 508         memset(block, value, 16);
 509         block += line_size;
 510     }
 511 }
 512
 513 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 514 {
 515     int i;
 516
 517     for (i = 0; i < h; i++) {
 518         memset(block, value, 8);
 519         block += line_size;
 520     }
 521 }
 522
 523 #define avg2(a,b) ((a+b+1)>>1)
 524 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 525
 526 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 527 {
 528     const int A=(16-x16)*(16-y16);
 529     const int B=(   x16)*(16-y16);
 530     const int C=(16-x16)*(   y16);
 531     const int D=(   x16)*(   y16);
 532     int i;
 533
 534     for(i=0; i<h; i++)
 535     {
 536         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 537         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 538         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 539         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 540         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 541         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 542         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 543         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 544         dst+= stride;
 545         src+= stride;
 546     }
 547 }
 548
 549 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 550                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 551 {
 552     int y, vx, vy;
 553     const int s= 1<<shift;
 554
 555     width--;
 556     height--;
 557
 558     for(y=0; y<h; y++){
 559         int x;
 560
 561         vx= ox;
 562         vy= oy;
 563         for(x=0; x<8; x++){ //XXX FIXME optimize
 564             int src_x, src_y, frac_x, frac_y, index;
 565
 566             src_x= vx>>16;
 567             src_y= vy>>16;
 568             frac_x= src_x&(s-1);
 569             frac_y= src_y&(s-1);
 570             src_x>>=shift;
 571             src_y>>=shift;
 572
 573             if((unsigned)src_x < width){
 574                 if((unsigned)src_y < height){
 575                     index= src_x + src_y*stride;
 576                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 577                                            + src[index       +1]*   frac_x )*(s-frac_y)
 578                                         + (  src[index+stride  ]*(s-frac_x)
 579                                            + src[index+stride+1]*   frac_x )*   frac_y
 580                                         + r)>>(shift*2);
 581                 }else{
 582                     index= src_x + av_clip(src_y, 0, height)*stride;
 583                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 584                                           + src[index       +1]*   frac_x )*s
 585                                         + r)>>(shift*2);
 586                 }
 587             }else{
 588                 if((unsigned)src_y < height){
 589                     index= av_clip(src_x, 0, width) + src_y*stride;
 590                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 591                                            + src[index+stride  ]*   frac_y )*s
 592                                         + r)>>(shift*2);
 593                 }else{
 594                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 595                     dst[y*stride + x]=    src[index         ];
 596                 }
 597             }
 598
 599             vx+= dxx;
 600             vy+= dyx;
 601         }
 602         ox += dxy;
 603         oy += dyy;
 604     }
 605 }
 606
 607 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 608     switch(width){
 609     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 610     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 611     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 612     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 613     }
 614 }
 615
 616 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 617     int i,j;
 618     for (i=0; i < height; i++) {
 619       for (j=0; j < width; j++) {
 620         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 621       }
 622       src += stride;
 623       dst += stride;
 624     }
 625 }
 626
 627 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 628     int i,j;
 629     for (i=0; i < height; i++) {
 630       for (j=0; j < width; j++) {
 631         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 632       }
 633       src += stride;
 634       dst += stride;
 635     }
 636 }
 637
 638 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 639     int i,j;
 640     for (i=0; i < height; i++) {
 641       for (j=0; j < width; j++) {
 642         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 643       }
 644       src += stride;
 645       dst += stride;
 646     }
 647 }
 648
 649 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 650     int i,j;
 651     for (i=0; i < height; i++) {
 652       for (j=0; j < width; j++) {
 653         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 654       }
 655       src += stride;
 656       dst += stride;
 657     }
 658 }
 659
 660 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 661     int i,j;
 662     for (i=0; i < height; i++) {
 663       for (j=0; j < width; j++) {
 664         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 665       }
 666       src += stride;
 667       dst += stride;
 668     }
 669 }
 670
 671 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 672     int i,j;
 673     for (i=0; i < height; i++) {
 674       for (j=0; j < width; j++) {
 675         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 676       }
 677       src += stride;
 678       dst += stride;
 679     }
 680 }
 681
 682 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 683     int i,j;
 684     for (i=0; i < height; i++) {
 685       for (j=0; j < width; j++) {
 686         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 687       }
 688       src += stride;
 689       dst += stride;
 690     }
 691 }
 692
 693 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 694     int i,j;
 695     for (i=0; i < height; i++) {
 696       for (j=0; j < width; j++) {
 697         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 698       }
 699       src += stride;
 700       dst += stride;
 701     }
 702 }
 703
 704 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 705     switch(width){
 706     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 707     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 708     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 709     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 710     }
 711 }
 712
 713 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 714     int i,j;
 715     for (i=0; i < height; i++) {
 716       for (j=0; j < width; j++) {
 717         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 718       }
 719       src += stride;
 720       dst += stride;
 721     }
 722 }
 723
 724 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 725     int i,j;
 726     for (i=0; i < height; i++) {
 727       for (j=0; j < width; j++) {
 728         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 729       }
 730       src += stride;
 731       dst += stride;
 732     }
 733 }
 734
 735 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 736     int i,j;
 737     for (i=0; i < height; i++) {
 738       for (j=0; j < width; j++) {
 739         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 740       }
 741       src += stride;
 742       dst += stride;
 743     }
 744 }
 745
 746 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 747     int i,j;
 748     for (i=0; i < height; i++) {
 749       for (j=0; j < width; j++) {
 750         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 751       }
 752       src += stride;
 753       dst += stride;
 754     }
 755 }
 756
 757 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 758     int i,j;
 759     for (i=0; i < height; i++) {
 760       for (j=0; j < width; j++) {
 761         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 762       }
 763       src += stride;
 764       dst += stride;
 765     }
 766 }
 767
 768 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 769     int i,j;
 770     for (i=0; i < height; i++) {
 771       for (j=0; j < width; j++) {
 772         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 773       }
 774       src += stride;
 775       dst += stride;
 776     }
 777 }
 778
 779 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 780     int i,j;
 781     for (i=0; i < height; i++) {
 782       for (j=0; j < width; j++) {
 783         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 784       }
 785       src += stride;
 786       dst += stride;
 787     }
 788 }
 789
 790 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 791     int i,j;
 792     for (i=0; i < height; i++) {
 793       for (j=0; j < width; j++) {
 794         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 795       }
 796       src += stride;
 797       dst += stride;
 798     }
 799 }
 800
 801 #define QPEL_MC(r, OPNAME, RND, OP) \
 802 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 803     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 804     int i;\
 805     for(i=0; i<h; i++)\
 806     {\
 807         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 808         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 809         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 810         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 811         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 812         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 813         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 814         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 815         dst+=dstStride;\
 816         src+=srcStride;\
 817     }\
 818 }\
 819 \
 820 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 821     const int w=8;\
 822     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 823     int i;\
 824     for(i=0; i<w; i++)\
 825     {\
 826         const int src0= src[0*srcStride];\
 827         const int src1= src[1*srcStride];\
 828         const int src2= src[2*srcStride];\
 829         const int src3= src[3*srcStride];\
 830         const int src4= src[4*srcStride];\
 831         const int src5= src[5*srcStride];\
 832         const int src6= src[6*srcStride];\
 833         const int src7= src[7*srcStride];\
 834         const int src8= src[8*srcStride];\
 835         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 836         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 837         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 838         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 839         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 840         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 841         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 842         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 843         dst++;\
 844         src++;\
 845     }\
 846 }\
 847 \
 848 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 849     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 850     int i;\
 851     \
 852     for(i=0; i<h; i++)\
 853     {\
 854         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 855         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 856         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 857         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 858         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 859         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 860         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 861         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 862         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 863         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 864         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 865         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 866         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 867         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 868         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 869         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 870         dst+=dstStride;\
 871         src+=srcStride;\
 872     }\
 873 }\
 874 \
 875 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 876     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 877     int i;\
 878     const int w=16;\
 879     for(i=0; i<w; i++)\
 880     {\
 881         const int src0= src[0*srcStride];\
 882         const int src1= src[1*srcStride];\
 883         const int src2= src[2*srcStride];\
 884         const int src3= src[3*srcStride];\
 885         const int src4= src[4*srcStride];\
 886         const int src5= src[5*srcStride];\
 887         const int src6= src[6*srcStride];\
 888         const int src7= src[7*srcStride];\
 889         const int src8= src[8*srcStride];\
 890         const int src9= src[9*srcStride];\
 891         const int src10= src[10*srcStride];\
 892         const int src11= src[11*srcStride];\
 893         const int src12= src[12*srcStride];\
 894         const int src13= src[13*srcStride];\
 895         const int src14= src[14*srcStride];\
 896         const int src15= src[15*srcStride];\
 897         const int src16= src[16*srcStride];\
 898         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 899         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 900         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 901         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 902         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 903         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 904         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 905         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 906         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 907         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 908         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 909         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 910         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 911         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 912         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 913         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 914         dst++;\
 915         src++;\
 916     }\
 917 }\
 918 \
 919 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 920     uint8_t half[64];\
 921     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 922     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 923 }\
 924 \
 925 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 926     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 927 }\
 928 \
 929 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 930     uint8_t half[64];\
 931     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 932     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 933 }\
 934 \
 935 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 936     uint8_t full[16*9];\
 937     uint8_t half[64];\
 938     copy_block9(full, src, 16, stride, 9);\
 939     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 940     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 941 }\
 942 \
 943 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 944     uint8_t full[16*9];\
 945     copy_block9(full, src, 16, stride, 9);\
 946     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 947 }\
 948 \
 949 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 950     uint8_t full[16*9];\
 951     uint8_t half[64];\
 952     copy_block9(full, src, 16, stride, 9);\
 953     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 954     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 955 }\
 956 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 957     uint8_t full[16*9];\
 958     uint8_t halfH[72];\
 959     uint8_t halfV[64];\
 960     uint8_t halfHV[64];\
 961     copy_block9(full, src, 16, stride, 9);\
 962     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 963     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 964     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 965     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 966 }\
 967 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 968     uint8_t full[16*9];\
 969     uint8_t halfH[72];\
 970     uint8_t halfHV[64];\
 971     copy_block9(full, src, 16, stride, 9);\
 972     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 973     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 974     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 975     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 976 }\
 977 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 978     uint8_t full[16*9];\
 979     uint8_t halfH[72];\
 980     uint8_t halfV[64];\
 981     uint8_t halfHV[64];\
 982     copy_block9(full, src, 16, stride, 9);\
 983     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 984     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 985     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 986     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 987 }\
 988 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 989     uint8_t full[16*9];\
 990     uint8_t halfH[72];\
 991     uint8_t halfHV[64];\
 992     copy_block9(full, src, 16, stride, 9);\
 993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 994     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 995     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 996     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 997 }\
 998 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
 999     uint8_t full[16*9];\
1000     uint8_t halfH[72];\
1001     uint8_t halfV[64];\
1002     uint8_t halfHV[64];\
1003     copy_block9(full, src, 16, stride, 9);\
1004     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1006     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1008 }\
1009 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1010     uint8_t full[16*9];\
1011     uint8_t halfH[72];\
1012     uint8_t halfHV[64];\
1013     copy_block9(full, src, 16, stride, 9);\
1014     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1016     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1018 }\
1019 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020     uint8_t full[16*9];\
1021     uint8_t halfH[72];\
1022     uint8_t halfV[64];\
1023     uint8_t halfHV[64];\
1024     copy_block9(full, src, 16, stride, 9);\
1025     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1026     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1029 }\
1030 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1031     uint8_t full[16*9];\
1032     uint8_t halfH[72];\
1033     uint8_t halfHV[64];\
1034     copy_block9(full, src, 16, stride, 9);\
1035     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1037     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1039 }\
1040 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1041     uint8_t halfH[72];\
1042     uint8_t halfHV[64];\
1043     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1044     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1045     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1046 }\
1047 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1048     uint8_t halfH[72];\
1049     uint8_t halfHV[64];\
1050     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1051     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1052     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1053 }\
1054 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1055     uint8_t full[16*9];\
1056     uint8_t halfH[72];\
1057     uint8_t halfV[64];\
1058     uint8_t halfHV[64];\
1059     copy_block9(full, src, 16, stride, 9);\
1060     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1061     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1062     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1063     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1064 }\
1065 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1066     uint8_t full[16*9];\
1067     uint8_t halfH[72];\
1068     copy_block9(full, src, 16, stride, 9);\
1069     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1070     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1071     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1072 }\
1073 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1074     uint8_t full[16*9];\
1075     uint8_t halfH[72];\
1076     uint8_t halfV[64];\
1077     uint8_t halfHV[64];\
1078     copy_block9(full, src, 16, stride, 9);\
1079     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1080     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1081     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1082     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1083 }\
1084 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1085     uint8_t full[16*9];\
1086     uint8_t halfH[72];\
1087     copy_block9(full, src, 16, stride, 9);\
1088     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1089     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1090     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1091 }\
1092 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1093     uint8_t halfH[72];\
1094     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1095     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1096 }\
1097 \
1098 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1099     uint8_t half[256];\
1100     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1101     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1102 }\
1103 \
1104 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1105     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1106 }\
1107 \
1108 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1109     uint8_t half[256];\
1110     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1111     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1112 }\
1113 \
1114 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1115     uint8_t full[24*17];\
1116     uint8_t half[256];\
1117     copy_block17(full, src, 24, stride, 17);\
1118     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1119     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1120 }\
1121 \
1122 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1123     uint8_t full[24*17];\
1124     copy_block17(full, src, 24, stride, 17);\
1125     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1126 }\
1127 \
1128 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1129     uint8_t full[24*17];\
1130     uint8_t half[256];\
1131     copy_block17(full, src, 24, stride, 17);\
1132     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1133     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1134 }\
1135 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1136     uint8_t full[24*17];\
1137     uint8_t halfH[272];\
1138     uint8_t halfV[256];\
1139     uint8_t halfHV[256];\
1140     copy_block17(full, src, 24, stride, 17);\
1141     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1143     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1145 }\
1146 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1147     uint8_t full[24*17];\
1148     uint8_t halfH[272];\
1149     uint8_t halfHV[256];\
1150     copy_block17(full, src, 24, stride, 17);\
1151     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1152     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1153     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1154     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1155 }\
1156 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1157     uint8_t full[24*17];\
1158     uint8_t halfH[272];\
1159     uint8_t halfV[256];\
1160     uint8_t halfHV[256];\
1161     copy_block17(full, src, 24, stride, 17);\
1162     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1163     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1164     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1165     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1166 }\
1167 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1168     uint8_t full[24*17];\
1169     uint8_t halfH[272];\
1170     uint8_t halfHV[256];\
1171     copy_block17(full, src, 24, stride, 17);\
1172     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1173     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1174     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1176 }\
1177 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178     uint8_t full[24*17];\
1179     uint8_t halfH[272];\
1180     uint8_t halfV[256];\
1181     uint8_t halfHV[256];\
1182     copy_block17(full, src, 24, stride, 17);\
1183     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1185     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1187 }\
1188 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1189     uint8_t full[24*17];\
1190     uint8_t halfH[272];\
1191     uint8_t halfHV[256];\
1192     copy_block17(full, src, 24, stride, 17);\
1193     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1195     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1197 }\
1198 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199     uint8_t full[24*17];\
1200     uint8_t halfH[272];\
1201     uint8_t halfV[256];\
1202     uint8_t halfHV[256];\
1203     copy_block17(full, src, 24, stride, 17);\
1204     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1208 }\
1209 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1210     uint8_t full[24*17];\
1211     uint8_t halfH[272];\
1212     uint8_t halfHV[256];\
1213     copy_block17(full, src, 24, stride, 17);\
1214     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1216     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1218 }\
1219 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1220     uint8_t halfH[272];\
1221     uint8_t halfHV[256];\
1222     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1223     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1224     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1225 }\
1226 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1227     uint8_t halfH[272];\
1228     uint8_t halfHV[256];\
1229     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1230     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1231     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1232 }\
1233 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1234     uint8_t full[24*17];\
1235     uint8_t halfH[272];\
1236     uint8_t halfV[256];\
1237     uint8_t halfHV[256];\
1238     copy_block17(full, src, 24, stride, 17);\
1239     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1240     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1241     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1242     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1243 }\
1244 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1245     uint8_t full[24*17];\
1246     uint8_t halfH[272];\
1247     copy_block17(full, src, 24, stride, 17);\
1248     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1249     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1250     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1251 }\
1252 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1253     uint8_t full[24*17];\
1254     uint8_t halfH[272];\
1255     uint8_t halfV[256];\
1256     uint8_t halfHV[256];\
1257     copy_block17(full, src, 24, stride, 17);\
1258     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1259     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1260     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1261     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1262 }\
1263 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1264     uint8_t full[24*17];\
1265     uint8_t halfH[272];\
1266     copy_block17(full, src, 24, stride, 17);\
1267     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1268     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1269     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1270 }\
1271 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1272     uint8_t halfH[272];\
1273     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1274     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1275 }
1276
1277 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1278 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1279 #define op_put(a, b) a = cm[((b) + 16)>>5]
1280 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1281
1282 QPEL_MC(0, put_       , _       , op_put)
1283 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1284 QPEL_MC(0, avg_       , _       , op_avg)
1285 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1286 #undef op_avg
1287 #undef op_avg_no_rnd
1288 #undef op_put
1289 #undef op_put_no_rnd
1290
/*
 * The mc00 entries do not get dedicated functions; they are aliased to the
 * plain 8x8 / 16x16 pixel put/avg routines.  Note that the put_no_rnd
 * aliases point at the ordinary put routines as well.
 */
/* 8x8 blocks */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
/* 16x16 blocks */
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1297
1298 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1299     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1300     int i;
1301
1302     for(i=0; i<h; i++){
1303         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1304         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1305         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1306         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1307         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1308         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1309         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1310         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1311         dst+=dstStride;
1312         src+=srcStride;
1313     }
1314 }
1315
1316 #if CONFIG_RV40_DECODER
/**
 * RV40 16x16 mc33 "put" entry point: forwards to put_pixels16_xy2_8_c with
 * a block height of 16.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_height = 16;

    put_pixels16_xy2_8_c(dst, src, stride, block_height);
}
/**
 * RV40 16x16 mc33 "avg" entry point: forwards to avg_pixels16_xy2_8_c with
 * a block height of 16.
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_height = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, block_height);
}
/**
 * RV40 8x8 mc33 "put" entry point: forwards to put_pixels8_xy2_8_c with
 * a block height of 8.
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_height = 8;

    put_pixels8_xy2_8_c(dst, src, stride, block_height);
}
/**
 * RV40 8x8 mc33 "avg" entry point: forwards to avg_pixels8_xy2_8_c with
 * a block height of 8.
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_height = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, block_height);
}
1329 #endif /* CONFIG_RV40_DECODER */
1330
1331 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1332     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1333     int i;
1334
1335     for(i=0; i<w; i++){
1336         const int src_1= src[ -srcStride];
1337         const int src0 = src[0          ];
1338         const int src1 = src[  srcStride];
1339         const int src2 = src[2*srcStride];
1340         const int src3 = src[3*srcStride];
1341         const int src4 = src[4*srcStride];
1342         const int src5 = src[5*srcStride];
1343         const int src6 = src[6*srcStride];
1344         const int src7 = src[7*srcStride];
1345         const int src8 = src[8*srcStride];
1346         const int src9 = src[9*srcStride];
1347         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1348         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1349         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1350         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1351         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1352         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1353         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1354         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1355         src++;
1356         dst++;
1357     }
1358 }
1359
/**
 * mspel 8x8 mc10: horizontally lowpass the block into a temporary plane,
 * then combine src and that plane into dst via put_pixels8_l2_8.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*8];

    wmv2_mspel8_h_lowpass(h_plane, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, h_plane, stride, stride, 8, 8);
}
1365
/**
 * mspel 8x8 mc20: write the horizontally lowpassed block straight to dst.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1369
/**
 * mspel 8x8 mc30: like mc10, but the horizontally lowpassed plane is combined
 * with the source shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*8];

    wmv2_mspel8_h_lowpass(h_plane, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, h_plane, stride, stride, 8, 8);
}
1375
/**
 * mspel 8x8 mc02: write the vertically lowpassed block straight to dst.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
1379
/**
 * mspel 8x8 mc12: combine the vertically lowpassed block with the
 * horizontally-then-vertically lowpassed block.
 *
 * The horizontal pass covers 11 rows starting one row above src, because the
 * following vertical pass reads rows -1 .. 9 of its input; it is therefore
 * started at the second row of that plane (h_plane + 8).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*11];
    uint8_t v_plane[8*8];
    uint8_t hv_plane[8*8];

    wmv2_mspel8_h_lowpass(h_plane, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_plane, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv_plane, h_plane + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_plane, hv_plane, stride, 8, 8, 8);
}
/**
 * mspel 8x8 mc32: like mc12, but the purely vertical pass is taken from the
 * source shifted one pixel to the right (src + 1).
 *
 * The horizontal pass covers 11 rows starting one row above src so that the
 * vertical pass over it (started at h_plane + 8) has the rows it reads.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*11];
    uint8_t v_plane[8*8];
    uint8_t hv_plane[8*8];

    wmv2_mspel8_h_lowpass(h_plane, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_plane, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv_plane, h_plane + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_plane, hv_plane, stride, 8, 8, 8);
}
/**
 * mspel 8x8 mc22: horizontal lowpass into an 11-row temporary plane (starting
 * one row above src), then vertical lowpass of that plane directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*11];

    wmv2_mspel8_h_lowpass(h_plane, src - stride, 8, stride, 11);
    /* skip the extra top row: the vertical filter reads row -1 itself */
    wmv2_mspel8_v_lowpass(dst, h_plane + 8, stride, 8, 8);
}
1403
1404 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1405     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1406     int x;
1407     const int strength= ff_h263_loop_filter_strength[qscale];
1408
1409     for(x=0; x<8; x++){
1410         int d1, d2, ad1;
1411         int p0= src[x-2*stride];
1412         int p1= src[x-1*stride];
1413         int p2= src[x+0*stride];
1414         int p3= src[x+1*stride];
1415         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1416
1417         if     (d<-2*strength) d1= 0;
1418         else if(d<-  strength) d1=-2*strength - d;
1419         else if(d<   strength) d1= d;
1420         else if(d< 2*strength) d1= 2*strength - d;
1421         else                   d1= 0;
1422
1423         p1 += d1;
1424         p2 -= d1;
1425         if(p1&256) p1= ~(p1>>31);
1426         if(p2&256) p2= ~(p2>>31);
1427
1428         src[x-1*stride] = p1;
1429         src[x+0*stride] = p2;
1430
1431         ad1= FFABS(d1)>>1;
1432
1433         d2= av_clip((p0-p3)/4, -ad1, ad1);
1434
1435         src[x-2*stride] = p0 - d2;
1436         src[x+  stride] = p3 + d2;
1437     }
1438     }
1439 }
1440
1441 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1442     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1443     int y;
1444     const int strength= ff_h263_loop_filter_strength[qscale];
1445
1446     for(y=0; y<8; y++){
1447         int d1, d2, ad1;
1448         int p0= src[y*stride-2];
1449         int p1= src[y*stride-1];
1450         int p2= src[y*stride+0];
1451         int p3= src[y*stride+1];
1452         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1453
1454         if     (d<-2*strength) d1= 0;
1455         else if(d<-  strength) d1=-2*strength - d;
1456         else if(d<   strength) d1= d;
1457         else if(d< 2*strength) d1= 2*strength - d;
1458         else                   d1= 0;
1459
1460         p1 += d1;
1461         p2 -= d1;
1462         if(p1&256) p1= ~(p1>>31);
1463         if(p2&256) p2= ~(p2>>31);
1464
1465         src[y*stride-1] = p1;
1466         src[y*stride+0] = p2;
1467
1468         ad1= FFABS(d1)>>1;
1469
1470         d2= av_clip((p0-p3)/4, -ad1, ad1);
1471
1472         src[y*stride-2] = p0 - d2;
1473         src[y*stride+1] = p3 + d2;
1474     }
1475     }
1476 }
1477
/**
 * H.261 loop filter: separable (1, 2, 1) smoothing of an 8x8 block in place.
 *
 * A vertical pass writes unnormalised sums (scaled by 4) into a temporary
 * 8x8 plane; the top and bottom rows are not filtered vertically and are
 * simply multiplied by 4.  A horizontal pass then filters the temporary
 * plane back into src: the leftmost and rightmost columns only undo the
 * vertical scaling ((v + 2) >> 2), interior columns apply the horizontal
 * (1, 2, 1) kernel and normalise by 16 with rounding.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vsum[64];
    int x, y;

    /* vertical pass: src -> vsum (reads only, src is still untouched) */
    for (x = 0; x < 8; x++) {
        vsum[x]       = 4*src[x];
        vsum[7*8 + x] = 4*src[7*stride + x];

        for (y = 1; y < 7; y++) {
            const uint8_t *p = src + y*stride + x;

            vsum[y*8 + x] = p[-stride] + 2*p[0] + p[stride];
        }
    }

    /* horizontal pass: vsum -> src */
    for (y = 0; y < 8; y++) {
        const int *in  = vsum + y*8;
        uint8_t   *out = src + y*stride;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for (x = 1; x < 7; x++)
            out[x] = (in[x - 1] + 2*in[x] + in[x + 1] + 8)>>4;
    }
}
1504
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows to compare
 * @return the accumulated |pix1 - pix2| over 16 columns and h rows
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1532
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * block formed by avg2() of each reference pixel with its right neighbour.
 * Each reference row is therefore read for 17 pixels.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 columns and h rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1560
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * block formed by avg2() of each reference pixel with the pixel directly
 * below it.  The reference is therefore read for h + 1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 columns and h rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1590
/**
 * Sum of absolute differences over a 16-pixel-wide area of h rows, where each
 * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the pixel, its
 * right neighbour, and the two pixels directly below those).
 *
 * @param v         unused
 * @param pix1      first row of the block being scored
 * @param pix2      first row of the reference; 17 bytes of h + 1 rows are read
 * @param line_size offset between consecutive rows, applied to both buffers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1620
/**
 * Sum of absolute differences between two 8-pixel-wide areas of h rows.
 *
 * @param v         unused
 * @param pix1      first row of the first area
 * @param pix2      first row of the second area
 * @param line_size offset between consecutive rows, applied to both buffers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1640
/**
 * Sum of absolute differences over an 8-pixel-wide area of h rows, where each
 * reference sample is avg2() of a pix2 pixel and its right-hand neighbour.
 *
 * @param v         unused
 * @param pix1      first row of the block being scored
 * @param pix2      first row of the reference; 9 bytes of every row are read
 * @param line_size offset between consecutive rows, applied to both buffers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1660
/**
 * Sum of absolute differences over an 8-pixel-wide area of h rows, where each
 * reference sample is avg2() of a pix2 pixel and the pixel one row below it.
 *
 * @param v         unused
 * @param pix1      first row of the block being scored
 * @param pix2      first row of the reference; h + 1 rows are read
 * @param line_size offset between consecutive rows, applied to both buffers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1682
/**
 * Sum of absolute differences over an 8-pixel-wide area of h rows, where each
 * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the pixel, its
 * right neighbour, and the two pixels directly below those).
 *
 * @param v         unused
 * @param pix1      first row of the block being scored
 * @param pix2      first row of the reference; 9 bytes of h + 1 rows are read
 * @param line_size offset between consecutive rows, applied to both buffers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1704
1705 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1706     MpegEncContext *c = v;
1707     int score1=0;
1708     int score2=0;
1709     int x,y;
1710
1711     for(y=0; y<h; y++){
1712         for(x=0; x<16; x++){
1713             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1714         }
1715         if(y+1<h){
1716             for(x=0; x<15; x++){
1717                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1718                              - s1[x+1] + s1[x+1+stride])
1719                         -FFABS(  s2[x  ] - s2[x  +stride]
1720                              - s2[x+1] + s2[x+1+stride]);
1721             }
1722         }
1723         s1+= stride;
1724         s2+= stride;
1725     }
1726
1727     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1728     else  return score1 + FFABS(score2)*8;
1729 }
1730
1731 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1732     MpegEncContext *c = v;
1733     int score1=0;
1734     int score2=0;
1735     int x,y;
1736
1737     for(y=0; y<h; y++){
1738         for(x=0; x<8; x++){
1739             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1740         }
1741         if(y+1<h){
1742             for(x=0; x<7; x++){
1743                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1744                              - s1[x+1] + s1[x+1+stride])
1745                         -FFABS(  s2[x  ] - s2[x  +stride]
1746                              - s2[x+1] + s2[x+1+stride]);
1747             }
1748         }
1749         s1+= stride;
1750         s2+= stride;
1751     }
1752
1753     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1754     else  return score1 + FFABS(score2)*8;
1755 }
1756
1757 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1758     int i;
1759     unsigned int sum=0;
1760
1761     for(i=0; i<8*8; i++){
1762         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1763         int w= weight[i];
1764         b>>= RECON_SHIFT;
1765         assert(-512<b && b<512);
1766
1767         sum += (w*b)*(w*b)>>4;
1768     }
1769     return sum>>2;
1770 }
1771
1772 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1773     int i;
1774
1775     for(i=0; i<8*8; i++){
1776         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1777     }
1778 }
1779
1780 /**
1781  * Permute an 8x8 block.
1782  * @param block the block which will be permuted according to the given permutation vector
1783  * @param permutation the permutation vector
1784  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1785  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1786  *                  (inverse) permutated to scantable order!
1787  */
1788 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1789 {
1790     int i;
1791     DCTELEM temp[64];
1792
1793     if(last<=0) return;
1794     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1795
1796     for(i=0; i<=last; i++){
1797         const int j= scantable[i];
1798         temp[j]= block[j];
1799         block[j]=0;
1800     }
1801
1802     for(i=0; i<=last; i++){
1803         const int j= scantable[i];
1804         const int perm_j= permutation[j];
1805         block[perm_j]= temp[j];
1806     }
1807 }
1808
/**
 * Comparison function that ignores all of its arguments and always scores 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1812
1813 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1814     int i;
1815
1816     memset(cmp, 0, sizeof(void*)*6);
1817
1818     for(i=0; i<6; i++){
1819         switch(type&0xFF){
1820         case FF_CMP_SAD:
1821             cmp[i]= c->sad[i];
1822             break;
1823         case FF_CMP_SATD:
1824             cmp[i]= c->hadamard8_diff[i];
1825             break;
1826         case FF_CMP_SSE:
1827             cmp[i]= c->sse[i];
1828             break;
1829         case FF_CMP_DCT:
1830             cmp[i]= c->dct_sad[i];
1831             break;
1832         case FF_CMP_DCT264:
1833             cmp[i]= c->dct264_sad[i];
1834             break;
1835         case FF_CMP_DCTMAX:
1836             cmp[i]= c->dct_max[i];
1837             break;
1838         case FF_CMP_PSNR:
1839             cmp[i]= c->quant_psnr[i];
1840             break;
1841         case FF_CMP_BIT:
1842             cmp[i]= c->bit[i];
1843             break;
1844         case FF_CMP_RD:
1845             cmp[i]= c->rd[i];
1846             break;
1847         case FF_CMP_VSAD:
1848             cmp[i]= c->vsad[i];
1849             break;
1850         case FF_CMP_VSSE:
1851             cmp[i]= c->vsse[i];
1852             break;
1853         case FF_CMP_ZERO:
1854             cmp[i]= zero_cmp;
1855             break;
1856         case FF_CMP_NSSE:
1857             cmp[i]= c->nsse[i];
1858             break;
1859 #if CONFIG_DWT
1860         case FF_CMP_W53:
1861             cmp[i]= c->w53[i];
1862             break;
1863         case FF_CMP_W97:
1864             cmp[i]= c->w97[i];
1865             break;
1866 #endif
1867         default:
1868             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1869         }
1870     }
1871 }
1872
1873 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1874     long i;
1875     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1876         long a = *(long*)(src+i);
1877         long b = *(long*)(dst+i);
1878         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1879     }
1880     for(; i<w; i++)
1881         dst[i+0] += src[i+0];
1882 }
1883
1884 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1885     long i;
1886 #if !HAVE_FAST_UNALIGNED
1887     if((long)src2 & (sizeof(long)-1)){
1888         for(i=0; i+7<w; i+=8){
1889             dst[i+0] = src1[i+0]-src2[i+0];
1890             dst[i+1] = src1[i+1]-src2[i+1];
1891             dst[i+2] = src1[i+2]-src2[i+2];
1892             dst[i+3] = src1[i+3]-src2[i+3];
1893             dst[i+4] = src1[i+4]-src2[i+4];
1894             dst[i+5] = src1[i+5]-src2[i+5];
1895             dst[i+6] = src1[i+6]-src2[i+6];
1896             dst[i+7] = src1[i+7]-src2[i+7];
1897         }
1898     }else
1899 #endif
1900     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1901         long a = *(long*)(src1+i);
1902         long b = *(long*)(src2+i);
1903         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1904     }
1905     for(; i<w; i++)
1906         dst[i+0] = src1[i+0]-src2[i+0];
1907 }
1908
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the prediction is mid_pred() of the previous output
 * value, the src1 sample at this position, and (previous + src1 - previous
 * src1) masked to 8 bits; diff[i] is added to it and the low 8 bits are
 * stored in dst[i].
 *
 * @param dst      reconstructed output, w bytes
 * @param src1     neighbouring row used by the predictor, w bytes
 * @param diff     residuals to add, w bytes
 * @param w        number of samples
 * @param left     in: previous output value; out: last value written
 * @param left_top in: src1 value preceding position 0; out: last src1 value
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top = src1[x];

        cur    = mid_pred(cur, top, (cur + top - diag) & 0xFF) + diff[x];
        diag   = top;
        dst[x] = cur;
    }

    *left     = cur;
    *left_top = diag;
}
1925
/**
 * Compute residuals of a row against a median predictor.
 *
 * For every position the prediction is mid_pred() of the previous src2
 * sample, the src1 sample at this position, and (previous + src1 - previous
 * src1) masked to 8 bits; dst[i] receives src2[i] minus that prediction,
 * truncated to 8 bits.
 *
 * @param dst      residual output, w bytes
 * @param src1     neighbouring row used by the predictor, w bytes
 * @param src2     row being predicted, w bytes
 * @param w        number of samples
 * @param left     in: src2 value preceding position 0; out: last src2 value
 * @param left_top in: src1 value preceding position 0; out: last src1 value
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int pred = mid_pred(cur, top, (cur + top - diag) & 0xFF);

        diag   = top;
        cur    = src2[x];
        dst[x] = cur - pred;
    }

    *left     = cur;
    *left_top = diag;
}
1943
/**
 * Running-sum reconstruction of a row: every output byte is the low 8 bits
 * of the accumulator after the matching src byte has been added to it.
 *
 * @param dst output, w bytes
 * @param src residuals, w bytes
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1962
/* Byte offset of each channel inside a 4-byte pixel, per host endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum reconstruction of a row of 4-byte pixels, one independent
 * accumulator per channel. Every output byte is the low 8 bits of its
 * channel's accumulator after the matching src byte has been added.
 *
 * @param dst   output, 4 * w bytes
 * @param src   residuals, 4 * w bytes
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: value after the last pixel
 * @param green same, for the green channel
 * @param blue  same, for the blue channel
 * @param alpha same, for the alpha channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++, src += 4, dst += 4) {
        /* read the whole pixel before writing it */
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2003
/*
 * BUTTERFLY2(o1, o2, i1, i2): store the sum of i1 and i2 in o1 and their
 * difference in o2. Expands to two statements, so it must not be used as the
 * unbraced body of an if/for/while.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/*
 * BUTTERFLY1(x, y): in-place butterfly; x becomes x+y and y becomes x-y,
 * both computed from the original values. The temporaries are named a and b,
 * so the arguments must not refer to outer variables with those names.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/*
 * BUTTERFLYA(x, y): |x+y| + |x-y|, i.e. the absolute values of both butterfly
 * outputs added together, without storing them. Arguments are evaluated more
 * than once.
 */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2018
/**
 * Score an 8x8 block by transforming the pixel differences src - dst with
 * three butterfly stages along every row and three along every column, and
 * summing the absolute values of the transformed differences.
 *
 * @param s      unused
 * @param dst    first row of one 8x8 block
 * @param src    first row of the other 8x8 block
 * @param stride offset between consecutive rows, applied to both buffers
 * @param h      must be 8 (asserted)
 * @return the sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* Row pass: the first stage also computes the src - dst differences. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Column pass: two stored stages, then the last stage is folded into the
     * absolute-value accumulation through BUTTERFLYA. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2063
/**
 * Score a single 8x8 block by transforming its pixels with three butterfly
 * stages along every row and three along every column, summing the absolute
 * values of the results, and then removing the contribution of the term
 * built from temp[0] and temp[32] (marked "-mean" below).
 *
 * @param s      unused
 * @param src    first row of the 8x8 block
 * @param dummy  unused
 * @param stride offset between consecutive rows of src
 * @param h      must be 8 (asserted)
 * @return the sum of absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* Row pass over the raw pixels. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Column pass: two stored stages, then the last stage is folded into the
     * absolute-value accumulation through BUTTERFLYA. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0] and temp[32] were never combined in storage, so their sum is
     * formed here before its magnitude is subtracted. */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2111
2112 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2113     MpegEncContext * const s= (MpegEncContext *)c;
2114     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2115
2116     assert(h==8);
2117
2118     s->dsp.diff_pixels(temp, src1, src2, stride);
2119     s->dsp.fdct(temp);
2120     return s->dsp.sum_abs_dctelem(temp);
2121 }
2122
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built only from additions, subtractions
 * and right shifts. The including code must define SRC(x) to read input
 * element x and DST(x,v) to consume output element x with value v. All eight
 * inputs are read into locals before any DST() is issued, so SRC and DST may
 * refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block as the sum of absolute coefficients obtained by running
 * DCT8_1D over the rows and then the columns of the pixel differences.
 *
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first row of the first 8x8 block
 * @param src2   first row of the second 8x8 block
 * @param stride offset between consecutive rows, applied to both buffers
 * @param h      unused (not checked here, unlike the sibling 8x8 metrics)
 * @return the sum of absolute transformed differences
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform every row of dct in place. */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* Second pass: transform every column; the outputs are not stored, only
     * their absolute values are accumulated. */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2175
2176 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2177     MpegEncContext * const s= (MpegEncContext *)c;
2178     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2179     int sum=0, i;
2180
2181     assert(h==8);
2182
2183     s->dsp.diff_pixels(temp, src1, src2, stride);
2184     s->dsp.fdct(temp);
2185
2186     for(i=0; i<64; i++)
2187         sum= FFMAX(sum, FFABS(temp[i]));
2188
2189     return sum;
2190 }
2191
2192 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2193     MpegEncContext * const s= (MpegEncContext *)c;
2194     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2195     DCTELEM * const bak = temp+64;
2196     int sum=0, i;
2197
2198     assert(h==8);
2199     s->mb_intra=0;
2200
2201     s->dsp.diff_pixels(temp, src1, src2, stride);
2202
2203     memcpy(bak, temp, 64*sizeof(DCTELEM));
2204
2205     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2206     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2207     ff_simple_idct_8(temp); //FIXME
2208
2209     for(i=0; i<64; i++)
2210         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2211
2212     return sum;
2213 }
2214
/**
 * Rate-distortion score for an 8x8 block.
 *
 * Both blocks are copied into local 8x8 buffers, their difference is
 * quantized, the number of bits needed for the quantized coefficients is
 * estimated from the context's VLC length tables, and the coefficients are
 * then unquantized and added back onto the local copy of src2 so that the
 * squared error against src1 can be measured.
 *
 * Side effect on the context: block_last_index[0] is overwritten with the
 * quantizer's return value.
 *
 * @param c      MpegEncContext providing quantizer, tables, qscale, mb_intra
 * @param src1   first row of the first 8x8 block
 * @param src2   first row of the second 8x8 block
 * @param stride offset between consecutive rows, applied to both buffers
 * @param h      must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* Work on packed 8x8 copies (row stride 8) so the caller's buffers are
     * never written. */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i only receives the quantizer's last output argument; it is reused as
     * a loop index below. */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        /* intra: coefficient 0 is costed through the DC table, AC starts at 1 */
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* levels in [-64, 63] are looked up in the table, anything
                 * else is charged the escape length */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        /* the coefficient at the last index is expected to be non-zero */
        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct onto the local copy of src2 and compare with src1 */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2290
/**
 * Estimate the number of bits needed to code the quantized difference of two
 * 8x8 blocks, using the context's VLC length tables.
 *
 * Side effect on the context: block_last_index[0] is overwritten with the
 * quantizer's return value.
 *
 * @param c      MpegEncContext providing quantizer, tables, qscale, mb_intra
 * @param src1   first row of the first 8x8 block
 * @param src2   first row of the second 8x8 block
 * @param stride offset between consecutive rows, applied to both buffers
 * @param h      must be 8 (asserted)
 * @return the estimated bit count
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i only receives the quantizer's last output argument; it is reused as
     * a loop index below. */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        /* intra: coefficient 0 is costed through the DC table, AC starts at 1 */
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* levels in [-64, 63] are looked up in the table, anything
                 * else is charged the escape length */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        /* the coefficient at the last index is expected to be non-zero */
        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2349
/*
 * VSAD_INTRA(size) defines vsad_intra<size>_c(): the sum, over h - 1 pairs of
 * vertically adjacent rows and <size> columns, of the absolute difference
 * between a pixel of s and the pixel directly below it. The c and dummy
 * arguments are unused. Columns are processed four at a time, so size must be
 * a multiple of 4.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
/* Instantiate the 8- and 16-pixel-wide variants. */
VSAD_INTRA(8)
VSAD_INTRA(16)
2367
/**
 * Sum, over h - 1 pairs of vertically adjacent rows and 16 columns, of the
 * absolute change in the difference s1 - s2 from one row to the next.
 *
 * @param c      unused
 * @param s1     first row of the first area
 * @param s2     first row of the second area
 * @param stride offset between consecutive rows, applied to both buffers
 * @param h      number of rows; h - 1 row pairs are evaluated
 * @return the accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2382
/* SQ(a): a squared. The argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * VSSE_INTRA(size) defines vsse_intra<size>_c(): the sum, over h - 1 pairs of
 * vertically adjacent rows and <size> columns, of the squared difference
 * between a pixel of s and the pixel directly below it. The c and dummy
 * arguments are unused. Columns are processed four at a time, so size must be
 * a multiple of 4.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
/* Instantiate the 8- and 16-pixel-wide variants. */
VSSE_INTRA(8)
VSSE_INTRA(16)
2401
/**
 * Sum, over h - 1 pairs of vertically adjacent rows and 16 columns, of the
 * squared change in the difference s1 - s2 from one row to the next.
 *
 * @param c      unused
 * @param s1     first row of the first area
 * @param s2     first row of the second area
 * @param stride offset between consecutive rows, applied to both buffers
 * @param h      number of rows; h - 1 row pairs are evaluated
 * @return the accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += SQ(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2416
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }
    return total;
}
2425
/*
 * Generate the second-named function of each pair from the 8x8 function named
 * first, through WRAPPER8_16_SQ (defined outside this part of the file).
 * NOTE(review): presumably each generated function covers a 16-wide area by
 * calling the 8x8 one several times - confirm against the macro definition.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2436
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 *
 * @param dst  output, len elements
 * @param src0 first factor, len elements
 * @param src1 second factor, len elements
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float a = src0[n];
        const float b = src1[n];

        dst[n] = a * b;
    }
}
2442
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len - 1 - i].
 *
 * @param dst  output, len elements
 * @param src0 first factor, len elements
 * @param src1 second factor, len elements, traversed in reverse
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len)
{
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
2449
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output, len elements
 * @param src0 first factor, len elements
 * @param src1 second factor, len elements
 * @param src2 addend, len elements
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len)
{
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n] + src2[n];
}
2455
/**
 * Windowed combination of two len-element inputs into a 2*len-element output.
 *
 * For k in [0, len), with m = 2*len - 1 - k:
 *   dst[k] = src0[k] * win[m] - src1[len - 1 - k] * win[k]
 *   dst[m] = src0[k] * win[k] + src1[len - 1 - k] * win[m]
 *
 * @param dst  output, 2 * len elements
 * @param src0 first input, len elements, read front to back
 * @param src1 second input, len elements, read back to front
 * @param win  window, 2 * len elements
 * @param len  half of the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int   m  = 2 * len - 1 - k;     /* mirror of k in the output/window */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[k];
        const float wj = win[m];

        dst[k] = s0*wj - s1*wi;
        dst[m] = s0*wi + s1*wj;
    }
}
2472
/**
 * Scale a float vector by a constant: dst[k] = src[k] * mul.
 *
 * @param dst output vector
 * @param src input vector
 * @param mul scale factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
2480
/**
 * Multiply-accumulate with a constant factor: dst[k] += src[k] * mul.
 *
 * @param dst accumulator vector, updated in place
 * @param src input vector
 * @param mul scale factor applied to src
 * @param len number of elements
 */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] += src[k] * mul;
        k++;
    }
}
2488
/**
 * In-place butterfly on two float vectors: afterwards v1 holds the
 * element-wise sum and v2 the element-wise difference (old v1 - old v2).
 *
 * @param v1  first vector; receives the sums
 * @param v2  second vector; receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2499
/**
 * Butterfly two float vectors and store the results interleaved:
 * dst[2k] = src0[k] + src1[k], dst[2k+1] = src0[k] - src1[k].
 *
 * @param dst  output, 2*len elements
 * @param src0 first input
 * @param src1 second input
 * @param len  number of input elements
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int k = 0;

    while (k < len) {
        const float a = src0[k];
        const float b = src1[k];
        float *pair = dst + 2 * k;

        pair[0] = a + b;
        pair[1] = a - b;
        k++;
    }
}
2511
/**
 * Dot product of two float vectors, accumulated in index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[k] * v2[k] for k in [0, len)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2522
/**
 * Clip one value using unsigned comparisons on raw 32-bit patterns.
 *
 * Returns mini when a compares above mini as an unsigned integer; otherwise
 * returns maxi when a with its top bit flipped compares above maxisign;
 * otherwise returns a unchanged. The caller in this file passes float bit
 * patterns with a negative minimum and a positive maximum.
 *
 * @param a        value to clip
 * @param mini     lower bound pattern
 * @param maxi     upper bound pattern
 * @param maxisign maxi with bit 31 flipped
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2531
/**
 * Clip a float vector by comparing raw 32-bit patterns through
 * clipf_c_one(). The caller in this file uses it only when *min < 0 and
 * *max > 0.
 *
 * Elements are processed in groups of 8, so a full group is written for
 * every start index below len.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits  = *(uint32_t*)min;
    const uint32_t hi_bits  = *(uint32_t*)max;
    const uint32_t hi_sign  = hi_bits ^ (1U<<31);
    uint32_t *out           = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_sign);
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When min is negative and max is positive, the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are processed in groups of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2567
/**
 * Dot product of two int16_t vectors, with each product right-shifted
 * before it is added to the sum.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every individual product
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k = 0;

    while (order--) {
        acc += (v1[k] * v2[k]) >> shift;
        k++;
    }

    return acc;
}
2577
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[k] += mul * v3[k]. Each product uses the value of v1[k] from before
 * its update.
 *
 * @param v1    first vector; modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector scaled by mul and added onto v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return sum of old v1[k] * v2[k]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k = 0;

    while (order--) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
        k++;
    }
    return acc;
}
2587
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k is
 * applied to both sample k and sample len-1-k. Each product is rounded
 * (adding 1 << 14) and shifted right by 15.
 *
 * @param output windowed samples
 * @param input  input samples
 * @param window window coefficients (first half)
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int lo;

    for (lo = 0; lo < half; lo++) {
        const int16_t coef = window[lo];

        output[lo]       = (MUL16(input[lo],       coef) + (1 << 14)) >> 15;
        output[len-lo-1] = (MUL16(input[len-lo-1], coef) + (1 << 14)) >> 15;
    }
}
2600
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are handled in groups of 8 and at least one group is always
 * processed. The loop ends only when len, reduced by 8 per group, reaches
 * exactly 0.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2616
/* Integer cosine constants used by wmv2_idct_row() and wmv2_idct_col()
 * below; each Wk is 2048*sqrt(2)*cos(k*pi/16) rounded to an integer. */
#define W0 2048 /* plain scale factor 2048 */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2625
2626 static void wmv2_idct_row(short * b)
2627 {
2628     int s1,s2;
2629     int a0,a1,a2,a3,a4,a5,a6,a7;
2630     /*step 1*/
2631     a1 = W1*b[1]+W7*b[7];
2632     a7 = W7*b[1]-W1*b[7];
2633     a5 = W5*b[5]+W3*b[3];
2634     a3 = W3*b[5]-W5*b[3];
2635     a2 = W2*b[2]+W6*b[6];
2636     a6 = W6*b[2]-W2*b[6];
2637     a0 = W0*b[0]+W0*b[4];
2638     a4 = W0*b[0]-W0*b[4];
2639     /*step 2*/
2640     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2641     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2642     /*step 3*/
2643     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2644     b[1] = (a4+a6 +s1   + (1<<7))>>8;
2645     b[2] = (a4-a6 +s2   + (1<<7))>>8;
2646     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2647     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2648     b[5] = (a4-a6 -s2   + (1<<7))>>8;
2649     b[6] = (a4+a6 -s1   + (1<<7))>>8;
2650     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2651 }
2652 static void wmv2_idct_col(short * b)
2653 {
2654     int s1,s2;
2655     int a0,a1,a2,a3,a4,a5,a6,a7;
2656     /*step 1, with extended precision*/
2657     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2658     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2659     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2660     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2661     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2662     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2663     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2664     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2665     /*step 2*/
2666     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2667     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2668     /*step 3*/
2669     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2670     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2671     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2672     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2673
2674     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2675     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2676     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2677     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2678 }
/**
 * In-place 8x8 inverse transform: wmv2_idct_row() on each of the 8 rows,
 * then wmv2_idct_col() on each of the 8 columns.
 *
 * @param block 64 coefficients stored row by row; overwritten
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2689 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2690  converted */
2691 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2692 {
2693     ff_wmv2_idct_c(block);
2694     ff_put_pixels_clamped_c(block, dest, line_size);
2695 }
2696 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2697 {
2698     ff_wmv2_idct_c(block);
2699     ff_add_pixels_clamped_c(block, dest, line_size);
2700 }
2701 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2702 {
2703     ff_j_rev_dct (block);
2704     ff_put_pixels_clamped_c(block, dest, line_size);
2705 }
2706 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2707 {
2708     ff_j_rev_dct (block);
2709     ff_add_pixels_clamped_c(block, dest, line_size);
2710 }
2711
2712 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2713 {
2714     ff_j_rev_dct4 (block);
2715     put_pixels_clamped4_c(block, dest, line_size);
2716 }
2717 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2718 {
2719     ff_j_rev_dct4 (block);
2720     add_pixels_clamped4_c(block, dest, line_size);
2721 }
2722
2723 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2724 {
2725     ff_j_rev_dct2 (block);
2726     put_pixels_clamped2_c(block, dest, line_size);
2727 }
2728 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2729 {
2730     ff_j_rev_dct2 (block);
2731     add_pixels_clamped2_c(block, dest, line_size);
2732 }
2733
2734 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2735 {
2736     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2737
2738     dest[0] = cm[(block[0] + 4)>>3];
2739 }
2740 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2741 {
2742     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2743
2744     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2745 }
2746
2747 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2748
2749 /* init static data */
2750 av_cold void ff_dsputil_static_init(void)
2751 {
2752     int i;
2753
2754     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2755     for(i=0;i<MAX_NEG_CROP;i++) {
2756         ff_cropTbl[i] = 0;
2757         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2758     }
2759
2760     for(i=0;i<512;i++) {
2761         ff_squareTbl[i] = (i - 256) * (i - 256);
2762     }
2763
2764     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2765 }
2766
2767 int ff_check_alignment(void){
2768     static int did_fail=0;
2769     LOCAL_ALIGNED_16(int, aligned, [4]);
2770
2771     if((intptr_t)aligned & 15){
2772         if(!did_fail){
2773 #if HAVE_MMX || HAVE_ALTIVEC
2774             av_log(NULL, AV_LOG_ERROR,
2775                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2776                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2777                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2778                 "Do not report crashes to Libav developers.\n");
2779 #endif
2780             did_fail=1;
2781         }
2782         return -1;
2783     }
2784     return 0;
2785 }
2786
/**
 * Fill a DSPContext with function pointers.
 *
 * The C implementations are installed first, chosen according to fields of
 * avctx (dct_algo, idct_algo, lowres, bits_per_raw_sample). Afterwards the
 * architecture-specific init routines are called with the same context, and
 * finally the IDCT scantable permutation is built.
 *
 * @param c     context to fill; c->dct_bits is read for the 9- and 10-bit
 *              cases, so it has to hold a meaningful value on entry
 * @param avctx codec context supplying the selection parameters
 */
av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

    /* Forward DCT (encoders only): 10-bit input gets the islow_10 variants,
     * otherwise the choice follows avctx->dct_algo. */
#if CONFIG_ENCODERS
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = ff_fdct_ifast;
            c->fdct248 = ff_fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
#endif //CONFIG_ENCODERS

    /* Inverse DCT: lowres 1/2/3 select ff_j_rev_dct4/2/1; otherwise 10-bit
     * input selects the simple_idct_10 set, and 8-bit follows idct_algo. */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = ff_j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = ff_j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = ff_j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if (avctx->bits_per_raw_sample == 10) {
            c->idct_put              = ff_simple_idct_put_10;
            c->idct_add              = ff_simple_idct_add_10;
            c->idct                  = ff_simple_idct_10;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        } else {
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = ff_j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE(review): only idct_put is assigned in this branch;
             * idct_add and idct keep whatever c held on entry. */
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put = ff_simple_idct_put_8;
            c->idct_add = ff_simple_idct_add_8;
            c->idct     = ff_simple_idct_8;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
        }
    }

    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* tpel tables: slots 3 and 7 are not assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc fills the 16 entries (mc00..mc33) of one row of a
     * <PFX>_pixels_tab with the <PFX><NUM>_mcXY_c functions. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) stores name16_c in slot 0 and name8x8_c in slot 1 */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->butterflies_float_interleave = butterflies_float_interleave_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
    c->vector_fmac_scalar = vector_fmac_scalar_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* Zero the 2-tap qpel tables so the fallback loop at the end of this
     * function can tell which entries nothing has filled in. */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* Name builders for the per-bit-depth functions:
     * FUNC(f, d) -> f_d, FUNCC(f, d) -> f_d_c. */
#undef FUNC
#undef FUNCC
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

    /* Install the bit-depth specific functions. For 9 and 10 bits the
     * 32-bit or 16-bit coefficient variants are picked from c->dct_bits;
     * every other depth uses the 8-bit, 16-bit-coefficient set. */
    switch (avctx->bits_per_raw_sample) {
    case 9:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    default:
        BIT_DEPTH_FUNCS(8, _16);
        break;
    }


    /* Architecture-specific initialisers run after the C defaults and get
     * the same context, so they are able to replace pointers set above. */
    if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
    if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);

    /* Any 2-tap qpel entry that is still NULL falls back to the matching
     * H.264 qpel function.
     * NOTE(review): the tables are indexed as [0][i] with i up to 63, i.e.
     * treated as flat arrays of 64 entries -- confirm against the table
     * dimensions declared in DSPContext. */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}