git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 9
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 10
  51 #include "dsputil_template.c"
  52 #undef BIT_DEPTH
  53
  54 #define BIT_DEPTH 8
  55 #include "dsputil_template.c"
  56
  /* ~0UL / 255 is a word with 0x01 in every byte, so these expand to 0x7f / 0x80
   * repeated in each byte of an unsigned long (0x7f7f7f7f when long is 32 bits,
   * 0x7f7f7f7f7f7f7f7f when it is 64 bits). */
  #define pb_7f ((~0UL / 255) * 0x7f)
  #define pb_80 ((~0UL / 255) * 0x80)
  60
  /* Zigzag scan order for an 8x8 block: entry i is the raster index
   * (row * 8 + column) of the i-th coefficient in scan order. */
  const uint8_t ff_zigzag_direct[64] = {
       0,  1,  8, 16,  9,  2,  3, 10,   /* scan positions  0.. 7 */
      17, 24, 32, 25, 18, 11,  4,  5,   /* scan positions  8..15 */
      12, 19, 26, 33, 40, 48, 41, 34,   /* scan positions 16..23 */
      27, 20, 13,  6,  7, 14, 21, 28,   /* scan positions 24..31 */
      35, 42, 49, 56, 57, 50, 43, 36,   /* scan positions 32..39 */
      29, 22, 15, 23, 30, 37, 44, 51,   /* scan positions 40..47 */
      58, 59, 52, 45, 38, 31, 39, 46,   /* scan positions 48..55 */
      53, 60, 61, 54, 47, 55, 62, 63    /* scan positions 56..63 */
  };
  71
  /* Zigzag scan used with the 248 IDCT. Note: unlike the specification, the
   * two fields are interleaved in this table. */
  const uint8_t ff_zigzag248_direct[64] = {
       0,  8,  1,  9, 16, 24,  2, 10,   /* scan positions  0.. 7 */
      17, 25, 32, 40, 48, 56, 33, 41,   /* scan positions  8..15 */
      18, 26,  3, 11,  4, 12, 19, 27,   /* scan positions 16..23 */
      34, 42, 49, 57, 50, 58, 35, 43,   /* scan positions 24..31 */
      20, 28,  5, 13,  6, 14, 21, 29,   /* scan positions 32..39 */
      36, 44, 51, 59, 52, 60, 37, 45,   /* scan positions 40..47 */
      22, 30,  7, 15, 23, 31, 38, 46,   /* scan positions 48..55 */
      53, 61, 54, 62, 39, 47, 55, 63,   /* scan positions 56..63 */
  };
  84
  85 /* Inverse of ff_zigzag_direct (not permuted), with 1 added to each entry, for the MMX quantizer. */
  86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  87
  /* Alternate horizontal scan order for an 8x8 block; the 64 entries are a
   * permutation of the raster indices 0..63. */
  const uint8_t ff_alternate_horizontal_scan[64] = {
       0,  1,  2,  3,  8,  9, 16, 17,   /* scan positions  0.. 7 */
      10, 11,  4,  5,  6,  7, 15, 14,   /* scan positions  8..15 */
      13, 12, 19, 18, 24, 25, 32, 33,   /* scan positions 16..23 */
      26, 27, 20, 21, 22, 23, 28, 29,   /* scan positions 24..31 */
      30, 31, 34, 35, 40, 41, 48, 49,   /* scan positions 32..39 */
      42, 43, 36, 37, 38, 39, 44, 45,   /* scan positions 40..47 */
      46, 47, 50, 51, 56, 57, 58, 59,   /* scan positions 48..55 */
      52, 53, 54, 55, 60, 61, 62, 63,   /* scan positions 56..63 */
  };
  98
  /* Alternate vertical scan order for an 8x8 block; the 64 entries are a
   * permutation of the raster indices 0..63. */
  const uint8_t ff_alternate_vertical_scan[64] = {
       0,  8, 16, 24,  1,  9,  2, 10,   /* scan positions  0.. 7 */
      17, 25, 32, 40, 48, 56, 57, 49,   /* scan positions  8..15 */
      41, 33, 26, 18,  3, 11,  4, 12,   /* scan positions 16..23 */
      19, 27, 34, 42, 50, 58, 35, 43,   /* scan positions 24..31 */
      51, 59, 20, 28,  5, 13,  6, 14,   /* scan positions 32..39 */
      21, 29, 36, 44, 52, 60, 37, 45,   /* scan positions 40..47 */
      53, 61, 22, 30,  7, 15, 23, 31,   /* scan positions 48..55 */
      38, 46, 54, 62, 39, 47, 55, 63,   /* scan positions 56..63 */
  };
 109
 /* Coefficient order expected on input by simple_idct_mmx; the 64 entries are
  * a permutation of 0x00..0x3f. */
 static const uint8_t simple_mmx_permutation[64] = {
     0x00, 0x08, 0x04, 0x09, 0x01, 0x0c, 0x05, 0x0d,
     0x10, 0x18, 0x14, 0x19, 0x11, 0x1c, 0x15, 0x1d,
     0x20, 0x28, 0x24, 0x29, 0x21, 0x2c, 0x25, 0x2d,
     0x12, 0x1a, 0x16, 0x1b, 0x13, 0x1e, 0x17, 0x1f,
     0x02, 0x0a, 0x06, 0x0b, 0x03, 0x0e, 0x07, 0x0f,
     0x30, 0x38, 0x34, 0x39, 0x31, 0x3c, 0x35, 0x3d,
     0x22, 0x2a, 0x26, 0x2b, 0x23, 0x2e, 0x27, 0x2f,
     0x32, 0x3a, 0x36, 0x3b, 0x33, 0x3e, 0x37, 0x3f,
 };
 121
 /* 8-entry permutation of 0..7; going by the name, a row ordering for the SSE2
  * IDCT (its use is outside this view). */
 static const uint8_t idct_sse2_row_perm[8] = {
     0, 4, 1, 5,
     2, 6, 3, 7,
 };
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134 #if ARCH_PPC
 135         st->inverse[j] = i;
 136 #endif
 137     }
 138
 139     end=-1;
 140     for(i=0; i<64; i++){
 141         int j;
 142         j = st->permutated[i];
 143         if(j>end) end=j;
 144         st->raster_end[i]= end;
 145     }
 146 }
 147
 /**
  * Sum all pixel values of a 16x16 block.
  *
  * @param pix       top-left pixel of the block
  * @param line_size offset in bytes between the starts of consecutive rows
  * @return the sum of the 256 pixel values
  */
 static int pix_sum_c(uint8_t * pix, int line_size)
 {
     int total = 0;
     int row, col;

     for (row = 0; row < 16; row++) {
         const uint8_t *line = pix + row * line_size;

         for (col = 0; col < 16; col++)
             total += line[col];
     }
     return total;
 }
 169
 170 static int pix_norm1_c(uint8_t * pix, int line_size)
 171 {
 172     int s, i, j;
 173     uint32_t *sq = ff_squareTbl + 256;
 174
 175     s = 0;
 176     for (i = 0; i < 16; i++) {
 177         for (j = 0; j < 16; j += 8) {
 178 #if 0
 179             s += sq[pix[0]];
 180             s += sq[pix[1]];
 181             s += sq[pix[2]];
 182             s += sq[pix[3]];
 183             s += sq[pix[4]];
 184             s += sq[pix[5]];
 185             s += sq[pix[6]];
 186             s += sq[pix[7]];
 187 #else
 188 #if HAVE_FAST_64BIT
 189             register uint64_t x=*(uint64_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             s += sq[(x>>32)&0xff];
 195             s += sq[(x>>40)&0xff];
 196             s += sq[(x>>48)&0xff];
 197             s += sq[(x>>56)&0xff];
 198 #else
 199             register uint32_t x=*(uint32_t*)pix;
 200             s += sq[x&0xff];
 201             s += sq[(x>>8)&0xff];
 202             s += sq[(x>>16)&0xff];
 203             s += sq[(x>>24)&0xff];
 204             x=*(uint32_t*)(pix+4);
 205             s += sq[x&0xff];
 206             s += sq[(x>>8)&0xff];
 207             s += sq[(x>>16)&0xff];
 208             s += sq[(x>>24)&0xff];
 209 #endif
 210 #endif
 211             pix += 8;
 212         }
 213         pix += line_size - 16;
 214     }
 215     return s;
 216 }
 217
 /**
  * Byte-swap an array of 32-bit words.
  *
  * @param dst destination array, written in ascending index order
  * @param src source array
  * @param w   number of 32-bit words to convert; nothing is done if w <= 0
  */
 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
 {
     int idx;

     for (idx = 0; idx < w; idx++) {
         const uint32_t word = src[idx];

         dst[idx] = av_bswap32(word);
     }
 }
 235
 /**
  * Byte-swap an array of 16-bit words.
  *
  * @param dst destination array, written in ascending index order
  * @param src source array
  * @param len number of 16-bit words to convert; nothing is done if len <= 0
  *
  * A bounded index loop is used so that a negative len is a no-op, as it is in
  * bswap_buf(), instead of counting down past zero and running off the buffers.
  */
 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 {
     int i;

     for (i = 0; i < len; i++)
         dst[i] = av_bswap16(src[i]);
 }
 241
 242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = ff_squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         pix1 += line_size;
 254         pix2 += line_size;
 255     }
 256     return s;
 257 }
 258
 259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 260 {
 261     int s, i;
 262     uint32_t *sq = ff_squareTbl + 256;
 263
 264     s = 0;
 265     for (i = 0; i < h; i++) {
 266         s += sq[pix1[0] - pix2[0]];
 267         s += sq[pix1[1] - pix2[1]];
 268         s += sq[pix1[2] - pix2[2]];
 269         s += sq[pix1[3] - pix2[3]];
 270         s += sq[pix1[4] - pix2[4]];
 271         s += sq[pix1[5] - pix2[5]];
 272         s += sq[pix1[6] - pix2[6]];
 273         s += sq[pix1[7] - pix2[7]];
 274         pix1 += line_size;
 275         pix2 += line_size;
 276     }
 277     return s;
 278 }
 279
 280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 281 {
 282     int s, i;
 283     uint32_t *sq = ff_squareTbl + 256;
 284
 285     s = 0;
 286     for (i = 0; i < h; i++) {
 287         s += sq[pix1[ 0] - pix2[ 0]];
 288         s += sq[pix1[ 1] - pix2[ 1]];
 289         s += sq[pix1[ 2] - pix2[ 2]];
 290         s += sq[pix1[ 3] - pix2[ 3]];
 291         s += sq[pix1[ 4] - pix2[ 4]];
 292         s += sq[pix1[ 5] - pix2[ 5]];
 293         s += sq[pix1[ 6] - pix2[ 6]];
 294         s += sq[pix1[ 7] - pix2[ 7]];
 295         s += sq[pix1[ 8] - pix2[ 8]];
 296         s += sq[pix1[ 9] - pix2[ 9]];
 297         s += sq[pix1[10] - pix2[10]];
 298         s += sq[pix1[11] - pix2[11]];
 299         s += sq[pix1[12] - pix2[12]];
 300         s += sq[pix1[13] - pix2[13]];
 301         s += sq[pix1[14] - pix2[14]];
 302         s += sq[pix1[15] - pix2[15]];
 303
 304         pix1 += line_size;
 305         pix2 += line_size;
 306     }
 307     return s;
 308 }
 309
 310 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 311                           const uint8_t *s2, int stride){
 312     int i;
 313
 314     /* read the pixels */
 315     for(i=0;i<8;i++) {
 316         block[0] = s1[0] - s2[0];
 317         block[1] = s1[1] - s2[1];
 318         block[2] = s1[2] - s2[2];
 319         block[3] = s1[3] - s2[3];
 320         block[4] = s1[4] - s2[4];
 321         block[5] = s1[5] - s2[5];
 322         block[6] = s1[6] - s2[6];
 323         block[7] = s1[7] - s2[7];
 324         s1 += stride;
 325         s2 += stride;
 326         block += 8;
 327     }
 328 }
 329
 330
 331 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 332                              int line_size)
 333 {
 334     int i;
 335     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 336
 337     /* read the pixels */
 338     for(i=0;i<8;i++) {
 339         pixels[0] = cm[block[0]];
 340         pixels[1] = cm[block[1]];
 341         pixels[2] = cm[block[2]];
 342         pixels[3] = cm[block[3]];
 343         pixels[4] = cm[block[4]];
 344         pixels[5] = cm[block[5]];
 345         pixels[6] = cm[block[6]];
 346         pixels[7] = cm[block[7]];
 347
 348         pixels += line_size;
 349         block += 8;
 350     }
 351 }
 352
 353 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 354                                  int line_size)
 355 {
 356     int i;
 357     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 358
 359     /* read the pixels */
 360     for(i=0;i<4;i++) {
 361         pixels[0] = cm[block[0]];
 362         pixels[1] = cm[block[1]];
 363         pixels[2] = cm[block[2]];
 364         pixels[3] = cm[block[3]];
 365
 366         pixels += line_size;
 367         block += 8;
 368     }
 369 }
 370
 371 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 372                                  int line_size)
 373 {
 374     int i;
 375     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 376
 377     /* read the pixels */
 378     for(i=0;i<2;i++) {
 379         pixels[0] = cm[block[0]];
 380         pixels[1] = cm[block[1]];
 381
 382         pixels += line_size;
 383         block += 8;
 384     }
 385 }
 386
 387 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 388                                     uint8_t *restrict pixels,
 389                                     int line_size)
 390 {
 391     int i, j;
 392
 393     for (i = 0; i < 8; i++) {
 394         for (j = 0; j < 8; j++) {
 395             if (*block < -128)
 396                 *pixels = 0;
 397             else if (*block > 127)
 398                 *pixels = 255;
 399             else
 400                 *pixels = (uint8_t)(*block + 128);
 401             block++;
 402             pixels++;
 403         }
 404         pixels += (line_size - 8);
 405     }
 406 }
 407
 408 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 409                              int line_size)
 410 {
 411     int i;
 412     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 413
 414     /* read the pixels */
 415     for(i=0;i<8;i++) {
 416         pixels[0] = cm[pixels[0] + block[0]];
 417         pixels[1] = cm[pixels[1] + block[1]];
 418         pixels[2] = cm[pixels[2] + block[2]];
 419         pixels[3] = cm[pixels[3] + block[3]];
 420         pixels[4] = cm[pixels[4] + block[4]];
 421         pixels[5] = cm[pixels[5] + block[5]];
 422         pixels[6] = cm[pixels[6] + block[6]];
 423         pixels[7] = cm[pixels[7] + block[7]];
 424         pixels += line_size;
 425         block += 8;
 426     }
 427 }
 428
 429 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 430                           int line_size)
 431 {
 432     int i;
 433     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 434
 435     /* read the pixels */
 436     for(i=0;i<4;i++) {
 437         pixels[0] = cm[pixels[0] + block[0]];
 438         pixels[1] = cm[pixels[1] + block[1]];
 439         pixels[2] = cm[pixels[2] + block[2]];
 440         pixels[3] = cm[pixels[3] + block[3]];
 441         pixels += line_size;
 442         block += 8;
 443     }
 444 }
 445
 446 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 447                           int line_size)
 448 {
 449     int i;
 450     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 451
 452     /* read the pixels */
 453     for(i=0;i<2;i++) {
 454         pixels[0] = cm[pixels[0] + block[0]];
 455         pixels[1] = cm[pixels[1] + block[1]];
 456         pixels += line_size;
 457         block += 8;
 458     }
 459 }
 460
 461 static int sum_abs_dctelem_c(DCTELEM *block)
 462 {
 463     int sum=0, i;
 464     for(i=0; i<64; i++)
 465         sum+= FFABS(block[i]);
 466     return sum;
 467 }
 468
 /**
  * Fill a block 16 pixels wide and h rows high with a constant value.
  *
  * @param block     top-left pixel of the block
  * @param value     byte to store in every pixel
  * @param line_size row stride in bytes
  * @param h         number of rows; nothing is written if h <= 0
  */
 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
     int rows_left = h;

     while (rows_left-- > 0) {
         memset(block, value, 16);
         block += line_size;
     }
 }
 478
 /**
  * Fill a block 8 pixels wide and h rows high with a constant value.
  *
  * @param block     top-left pixel of the block
  * @param value     byte to store in every pixel
  * @param line_size row stride in bytes
  * @param h         number of rows; nothing is written if h <= 0
  */
 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
     int rows_left = h;

     while (rows_left-- > 0) {
         memset(block, value, 8);
         block += line_size;
     }
 }
 488
 /**
  * Upscale an 8x8 block to 16x16 by pixel doubling: every source pixel is
  * copied to a 2x2 group of destination pixels.
  *
  * The output is written byte by byte. The previous implementation stored
  * src[i] * 0x0101 through uint16_t pointers cast from dst, which is undefined
  * behaviour under the aliasing rules and misaligned whenever dst or linesize
  * is odd; the bytes produced are the same.
  *
  * @param src      64 source pixels, 8 per row
  * @param dst      top-left destination pixel; 16 rows of 16 bytes are written
  * @param linesize destination row stride in bytes
  */
 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 {
     int row, col;

     for (row = 0; row < 8; row++) {
         uint8_t *top    = dst + (2 * row) * linesize;
         uint8_t *bottom = top + linesize;

         for (col = 0; col < 8; col++) {
             const uint8_t px = src[col];

             top[2 * col]        = px;
             top[2 * col + 1]    = px;
             bottom[2 * col]     = px;
             bottom[2 * col + 1] = px;
         }
         src += 8;
     }
 }
 504
 /* Rounded averages of two / four values. Every argument is parenthesized so
  * that expression arguments (e.g. x|y or c ? a : b) keep their own precedence. */
 #define avg2(a,b) (((a)+(b)+1)>>1)
 #define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
 507
 /**
  * Bilinear interpolation of a block 8 pixels wide and h rows high at a
  * fractional offset given in 1/16 pixel units.
  *
  * Each output pixel is the weighted sum of the source pixel, its right
  * neighbour and the two pixels one row below, plus rounder, shifted right by 8.
  *
  * @param dst     top-left destination pixel
  * @param src     top-left source pixel; 9 columns and h + 1 rows are read
  * @param stride  row stride in bytes, shared by dst and src
  * @param h       number of rows to produce
  * @param x16     horizontal weight of the right-hand neighbours (out of 16)
  * @param y16     vertical weight of the row below (out of 16)
  * @param rounder value added before the shift
  */
 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 {
     const int inv_x = 16 - x16;
     const int inv_y = 16 - y16;
     const int w_tl  = inv_x * inv_y;   /* this pixel              */
     const int w_tr  = x16   * inv_y;   /* right neighbour         */
     const int w_bl  = inv_x * y16;     /* pixel below             */
     const int w_br  = x16   * y16;     /* pixel below and right   */
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 8; col++) {
             dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                         w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                         rounder) >> 8;
         }
         dst += stride;
         src += stride;
     }
 }
 530
 /**
  * Global motion compensation for a block 8 pixels wide and h rows high.
  *
  * A source position (vx, vy) is tracked per output pixel; it starts at
  * (ox, oy), advances by (dxx, dyx) per column and the row start advances by
  * (dxy, dyy) per row. After dropping the low 16 bits, the low `shift` bits are
  * the fractional part and the rest the integer source coordinate.
  *
  * Pixels whose integer coordinate lies in [0, width - 1) x [0, height - 1)
  * are interpolated bilinearly; when one coordinate is outside, it is clipped
  * to [0, width - 1] / [0, height - 1] and interpolation happens along the
  * other axis only; when both are outside, the clipped pixel is copied.
  *
  * @param dst    top-left destination pixel
  * @param src    top-left pixel of the source picture
  * @param stride row stride in bytes, shared by dst and src
  * @param h      number of rows to produce
  * @param ox,oy  start position of the first row
  * @param dxx,dyx position step per output column
  * @param dxy,dyy position step per output row
  * @param shift  number of fractional bits
  * @param r      rounding constant added before the final shift by 2 * shift
  * @param width  source width used for the bounds checks
  * @param height source height used for the bounds checks
  */
 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 {
     const int s         = 1 << shift;
     const int frac_mask = s - 1;
     const int down      = shift * 2;
     const int last_col  = width - 1;
     const int last_row  = height - 1;
     int row;

     for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
         int vx = ox;
         int vy = oy;
         int col;

         for (col = 0; col < 8; col++, vx += dxx, vy += dyx) {
             const int fixed_x  = vx >> 16;
             const int fixed_y  = vy >> 16;
             const int frac_x   = fixed_x & frac_mask;
             const int frac_y   = fixed_y & frac_mask;
             const int src_x    = fixed_x >> shift;
             const int src_y    = fixed_y >> shift;
             /* unsigned compare rejects negative coordinates as well */
             const int x_inside = (unsigned)src_x < (unsigned)last_col;
             const int y_inside = (unsigned)src_y < (unsigned)last_row;
             int index, value;

             if (x_inside && y_inside) {
                 index = src_x + src_y * stride;
                 value = (  (  src[index             ] * (s - frac_x)
                             + src[index          + 1] *      frac_x ) * (s - frac_y)
                          + (  src[index + stride    ] * (s - frac_x)
                             + src[index + stride + 1] *      frac_x ) *      frac_y
                          + r) >> down;
             } else if (x_inside) {
                 index = src_x + av_clip(src_y, 0, last_row) * stride;
                 value = (  (  src[index    ] * (s - frac_x)
                             + src[index + 1] *      frac_x ) * s
                          + r) >> down;
             } else if (y_inside) {
                 index = av_clip(src_x, 0, last_col) + src_y * stride;
                 value = (  (  src[index         ] * (s - frac_y)
                             + src[index + stride] *      frac_y ) * s
                          + r) >> down;
             } else {
                 index = av_clip(src_x, 0, last_col) + av_clip(src_y, 0, last_row) * stride;
                 value = src[index];
             }

             dst[row * stride + col] = value;
         }
     }
 }
 588
 /* Third-pel MC, no fractional offset: dispatch to the 8-bit put_pixels routine
  * matching the block width (2, 4, 8 or 16); any other width does nothing. */
 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     if (width == 2)
         put_pixels2_8_c (dst, src, stride, height);
     else if (width == 4)
         put_pixels4_8_c (dst, src, stride, height);
     else if (width == 8)
         put_pixels8_8_c (dst, src, stride, height);
     else if (width == 16)
         put_pixels16_8_c(dst, src, stride, height);
 }
 597
 /* Third-pel MC: dst = (2*cur + right + 1) / 3, the division being done as
  * a multiply by 683 and a shift right by 11. */
 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 2 * src[col] + src[col + 1] + 1;

             dst[col] = (683 * taps) >> 11;
         }
     }
 }
 608
 /* Third-pel MC: dst = (cur + 2*right + 1) / 3, the division being done as
  * a multiply by 683 and a shift right by 11. */
 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = src[col] + 2 * src[col + 1] + 1;

             dst[col] = (683 * taps) >> 11;
         }
     }
 }
 619
 /* Third-pel MC: dst = (2*cur + below + 1) / 3, the division being done as
  * a multiply by 683 and a shift right by 11. */
 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 2 * src[col] + src[col + stride] + 1;

             dst[col] = (683 * taps) >> 11;
         }
     }
 }
 630
 /* Third-pel MC: dst = (4*cur + 3*right + 3*below + 2*diag + 6) / 12, the
  * division being done as a multiply by 2731 and a shift right by 15. */
 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 4 * src[col]          + 3 * src[col + 1] +
                              3 * src[col + stride] + 2 * src[col + stride + 1] + 6;

             dst[col] = (2731 * taps) >> 15;
         }
     }
 }
 641
 /* Third-pel MC: dst = (3*cur + 2*right + 4*below + 3*diag + 6) / 12, the
  * division being done as a multiply by 2731 and a shift right by 15. */
 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 3 * src[col]          + 2 * src[col + 1] +
                              4 * src[col + stride] + 3 * src[col + stride + 1] + 6;

             dst[col] = (2731 * taps) >> 15;
         }
     }
 }
 652
 /* Third-pel MC: dst = (cur + 2*below + 1) / 3, the division being done as
  * a multiply by 683 and a shift right by 11. */
 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = src[col] + 2 * src[col + stride] + 1;

             dst[col] = (683 * taps) >> 11;
         }
     }
 }
 663
 /* Third-pel MC: dst = (3*cur + 4*right + 2*below + 3*diag + 6) / 12, the
  * division being done as a multiply by 2731 and a shift right by 15. */
 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 3 * src[col]          + 4 * src[col + 1] +
                              2 * src[col + stride] + 3 * src[col + stride + 1] + 6;

             dst[col] = (2731 * taps) >> 15;
         }
     }
 }
 674
 /* Third-pel MC: dst = (2*cur + 3*right + 3*below + 4*diag + 6) / 12, the
  * division being done as a multiply by 2731 and a shift right by 15. */
 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 2 * src[col]          + 3 * src[col + 1] +
                              3 * src[col + stride] + 4 * src[col + stride + 1] + 6;

             dst[col] = (2731 * taps) >> 15;
         }
     }
 }
 685
 /* Third-pel MC, no fractional offset, averaging variant: dispatch to the 8-bit
  * avg_pixels routine matching the block width (2, 4, 8 or 16); any other width
  * does nothing. */
 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     if (width == 2)
         avg_pixels2_8_c (dst, src, stride, height);
     else if (width == 4)
         avg_pixels4_8_c (dst, src, stride, height);
     else if (width == 8)
         avg_pixels8_8_c (dst, src, stride, height);
     else if (width == 16)
         avg_pixels16_8_c(dst, src, stride, height);
 }
 694
 /* Third-pel MC, averaging variant: the prediction (2*cur + right + 1) / 3
  * (multiply by 683, shift by 11) is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int pred = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 705
 /* Third-pel MC, averaging variant: the prediction (cur + 2*right + 1) / 3
  * (multiply by 683, shift by 11) is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int pred = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 716
 /* Third-pel MC, averaging variant: the prediction (2*cur + below + 1) / 3
  * (multiply by 683, shift by 11) is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int pred = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 727
 /* Third-pel MC, averaging variant: the prediction
  * (4*cur + 3*right + 3*below + 2*diag + 6) / 12 (multiply by 2731, shift by 15)
  * is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 4 * src[col]          + 3 * src[col + 1] +
                              3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
             const int pred = (2731 * taps) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 738
 /* Third-pel MC, averaging variant: the prediction
  * (3*cur + 2*right + 4*below + 3*diag + 6) / 12 (multiply by 2731, shift by 15)
  * is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 3 * src[col]          + 2 * src[col + 1] +
                              4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
             const int pred = (2731 * taps) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 749
 /* Third-pel MC, averaging variant: the prediction (cur + 2*below + 1) / 3
  * (multiply by 683, shift by 11) is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int pred = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 760
 /* Third-pel MC, averaging variant: the prediction
  * (3*cur + 4*right + 2*below + 3*diag + 6) / 12 (multiply by 2731, shift by 15)
  * is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 3 * src[col]          + 4 * src[col + 1] +
                              2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
             const int pred = (2731 * taps) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 771
 /* Third-pel MC, averaging variant: the prediction
  * (2*cur + 3*right + 3*below + 4*diag + 6) / 12 (multiply by 2731, shift by 15)
  * is averaged with rounding into dst. */
 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
 {
     int row;

     for (row = 0; row < height; row++, src += stride, dst += stride) {
         int col;

         for (col = 0; col < width; col++) {
             const int taps = 2 * src[col]          + 3 * src[col + 1] +
                              3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
             const int pred = (2731 * taps) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
     }
 }
 782
 783 #define QPEL_MC(r, OPNAME, RND, OP) \
 784 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 785     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 786     int i;\
 787     for(i=0; i<h; i++)\
 788     {\
 789         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 790         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 791         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 792         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 793         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 794         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 795         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 796         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 797         dst+=dstStride;\
 798         src+=srcStride;\
 799     }\
 800 }\
 801 \
 802 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 803     const int w=8;\
 804     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 805     int i;\
 806     for(i=0; i<w; i++)\
 807     {\
 808         const int src0= src[0*srcStride];\
 809         const int src1= src[1*srcStride];\
 810         const int src2= src[2*srcStride];\
 811         const int src3= src[3*srcStride];\
 812         const int src4= src[4*srcStride];\
 813         const int src5= src[5*srcStride];\
 814         const int src6= src[6*srcStride];\
 815         const int src7= src[7*srcStride];\
 816         const int src8= src[8*srcStride];\
 817         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 818         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 819         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 820         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 821         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 822         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 823         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 824         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 825         dst++;\
 826         src++;\
 827     }\
 828 }\
 829 \
 830 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 831     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 832     int i;\
 833     \
 834     for(i=0; i<h; i++)\
 835     {\
 836         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 837         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 838         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 839         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 840         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 841         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 842         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 843         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 844         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 845         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 846         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 847         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 848         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 849         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 850         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 851         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 852         dst+=dstStride;\
 853         src+=srcStride;\
 854     }\
 855 }\
 856 \
 857 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 858     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 859     int i;\
 860     const int w=16;\
 861     for(i=0; i<w; i++)\
 862     {\
 863         const int src0= src[0*srcStride];\
 864         const int src1= src[1*srcStride];\
 865         const int src2= src[2*srcStride];\
 866         const int src3= src[3*srcStride];\
 867         const int src4= src[4*srcStride];\
 868         const int src5= src[5*srcStride];\
 869         const int src6= src[6*srcStride];\
 870         const int src7= src[7*srcStride];\
 871         const int src8= src[8*srcStride];\
 872         const int src9= src[9*srcStride];\
 873         const int src10= src[10*srcStride];\
 874         const int src11= src[11*srcStride];\
 875         const int src12= src[12*srcStride];\
 876         const int src13= src[13*srcStride];\
 877         const int src14= src[14*srcStride];\
 878         const int src15= src[15*srcStride];\
 879         const int src16= src[16*srcStride];\
 880         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 881         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 882         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 883         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 884         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 885         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 886         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 887         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 888         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 889         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 890         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 891         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 892         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 893         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 894         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 895         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 896         dst++;\
 897         src++;\
 898     }\
 899 }\
 900 \
 901 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 902     uint8_t half[64];\
 903     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 904     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 905 }\
 906 \
 907 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 908     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 909 }\
 910 \
 911 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 912     uint8_t half[64];\
 913     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 914     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 915 }\
 916 \
 917 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 918     uint8_t full[16*9];\
 919     uint8_t half[64];\
 920     copy_block9(full, src, 16, stride, 9);\
 921     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 922     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 923 }\
 924 \
 925 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 926     uint8_t full[16*9];\
 927     copy_block9(full, src, 16, stride, 9);\
 928     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 929 }\
 930 \
 931 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 932     uint8_t full[16*9];\
 933     uint8_t half[64];\
 934     copy_block9(full, src, 16, stride, 9);\
 935     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 936     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 937 }\
 938 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 939     uint8_t full[16*9];\
 940     uint8_t halfH[72];\
 941     uint8_t halfV[64];\
 942     uint8_t halfHV[64];\
 943     copy_block9(full, src, 16, stride, 9);\
 944     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 945     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 946     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 947     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 948 }\
 949 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 950     uint8_t full[16*9];\
 951     uint8_t halfH[72];\
 952     uint8_t halfHV[64];\
 953     copy_block9(full, src, 16, stride, 9);\
 954     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 955     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 956     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 957     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 958 }\
 959 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 960     uint8_t full[16*9];\
 961     uint8_t halfH[72];\
 962     uint8_t halfV[64];\
 963     uint8_t halfHV[64];\
 964     copy_block9(full, src, 16, stride, 9);\
 965     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 966     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 967     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 968     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 969 }\
 970 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 971     uint8_t full[16*9];\
 972     uint8_t halfH[72];\
 973     uint8_t halfHV[64];\
 974     copy_block9(full, src, 16, stride, 9);\
 975     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 976     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 977     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 978     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 979 }\
 980 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
 981     uint8_t full[16*9];\
 982     uint8_t halfH[72];\
 983     uint8_t halfV[64];\
 984     uint8_t halfHV[64];\
 985     copy_block9(full, src, 16, stride, 9);\
 986     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 987     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 988     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 989     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 990 }\
 991 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
 992     uint8_t full[16*9];\
 993     uint8_t halfH[72];\
 994     uint8_t halfHV[64];\
 995     copy_block9(full, src, 16, stride, 9);\
 996     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 997     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 998     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 999     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1000 }\
1001 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1002     uint8_t full[16*9];\
1003     uint8_t halfH[72];\
1004     uint8_t halfV[64];\
1005     uint8_t halfHV[64];\
1006     copy_block9(full, src, 16, stride, 9);\
1007     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1008     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1009     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1010     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1011 }\
1012 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1013     uint8_t full[16*9];\
1014     uint8_t halfH[72];\
1015     uint8_t halfHV[64];\
1016     copy_block9(full, src, 16, stride, 9);\
1017     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1018     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1019     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1021 }\
1022 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1023     uint8_t halfH[72];\
1024     uint8_t halfHV[64];\
1025     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1026     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1027     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1028 }\
1029 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1030     uint8_t halfH[72];\
1031     uint8_t halfHV[64];\
1032     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1033     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1034     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1035 }\
1036 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1037     uint8_t full[16*9];\
1038     uint8_t halfH[72];\
1039     uint8_t halfV[64];\
1040     uint8_t halfHV[64];\
1041     copy_block9(full, src, 16, stride, 9);\
1042     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1043     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1044     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1045     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1046 }\
1047 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1048     uint8_t full[16*9];\
1049     uint8_t halfH[72];\
1050     copy_block9(full, src, 16, stride, 9);\
1051     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1052     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1053     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1054 }\
1055 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1056     uint8_t full[16*9];\
1057     uint8_t halfH[72];\
1058     uint8_t halfV[64];\
1059     uint8_t halfHV[64];\
1060     copy_block9(full, src, 16, stride, 9);\
1061     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1063     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1064     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1065 }\
1066 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1067     uint8_t full[16*9];\
1068     uint8_t halfH[72];\
1069     copy_block9(full, src, 16, stride, 9);\
1070     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1071     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1072     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1073 }\
1074 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1075     uint8_t halfH[72];\
1076     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1077     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1078 }\
1079 \
1080 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1081     uint8_t half[256];\
1082     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1083     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1084 }\
1085 \
1086 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1087     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1088 }\
1089 \
1090 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1091     uint8_t half[256];\
1092     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1093     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1094 }\
1095 \
1096 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1097     uint8_t full[24*17];\
1098     uint8_t half[256];\
1099     copy_block17(full, src, 24, stride, 17);\
1100     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1101     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1102 }\
1103 \
1104 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1105     uint8_t full[24*17];\
1106     copy_block17(full, src, 24, stride, 17);\
1107     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1108 }\
1109 \
1110 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1111     uint8_t full[24*17];\
1112     uint8_t half[256];\
1113     copy_block17(full, src, 24, stride, 17);\
1114     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1115     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1116 }\
1117 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1118     uint8_t full[24*17];\
1119     uint8_t halfH[272];\
1120     uint8_t halfV[256];\
1121     uint8_t halfHV[256];\
1122     copy_block17(full, src, 24, stride, 17);\
1123     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1124     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1125     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1126     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1127 }\
1128 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1129     uint8_t full[24*17];\
1130     uint8_t halfH[272];\
1131     uint8_t halfHV[256];\
1132     copy_block17(full, src, 24, stride, 17);\
1133     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1134     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1135     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1136     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1137 }\
1138 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1139     uint8_t full[24*17];\
1140     uint8_t halfH[272];\
1141     uint8_t halfV[256];\
1142     uint8_t halfHV[256];\
1143     copy_block17(full, src, 24, stride, 17);\
1144     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1145     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1146     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1147     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1148 }\
1149 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1150     uint8_t full[24*17];\
1151     uint8_t halfH[272];\
1152     uint8_t halfHV[256];\
1153     copy_block17(full, src, 24, stride, 17);\
1154     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1155     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1156     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1157     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1158 }\
1159 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1160     uint8_t full[24*17];\
1161     uint8_t halfH[272];\
1162     uint8_t halfV[256];\
1163     uint8_t halfHV[256];\
1164     copy_block17(full, src, 24, stride, 17);\
1165     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1166     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1167     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1168     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1169 }\
1170 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1171     uint8_t full[24*17];\
1172     uint8_t halfH[272];\
1173     uint8_t halfHV[256];\
1174     copy_block17(full, src, 24, stride, 17);\
1175     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1177     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1179 }\
1180 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1181     uint8_t full[24*17];\
1182     uint8_t halfH[272];\
1183     uint8_t halfV[256];\
1184     uint8_t halfHV[256];\
1185     copy_block17(full, src, 24, stride, 17);\
1186     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1187     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1188     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1189     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1190 }\
1191 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1192     uint8_t full[24*17];\
1193     uint8_t halfH[272];\
1194     uint8_t halfHV[256];\
1195     copy_block17(full, src, 24, stride, 17);\
1196     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1197     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1198     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1200 }\
1201 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1202     uint8_t halfH[272];\
1203     uint8_t halfHV[256];\
1204     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1207 }\
1208 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1209     uint8_t halfH[272];\
1210     uint8_t halfHV[256];\
1211     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1212     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1213     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1214 }\
1215 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1216     uint8_t full[24*17];\
1217     uint8_t halfH[272];\
1218     uint8_t halfV[256];\
1219     uint8_t halfHV[256];\
1220     copy_block17(full, src, 24, stride, 17);\
1221     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1222     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1223     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1224     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1225 }\
1226 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1227     uint8_t full[24*17];\
1228     uint8_t halfH[272];\
1229     copy_block17(full, src, 24, stride, 17);\
1230     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1231     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1232     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1233 }\
1234 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1235     uint8_t full[24*17];\
1236     uint8_t halfH[272];\
1237     uint8_t halfV[256];\
1238     uint8_t halfHV[256];\
1239     copy_block17(full, src, 24, stride, 17);\
1240     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1242     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1243     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1244 }\
1245 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1246     uint8_t full[24*17];\
1247     uint8_t halfH[272];\
1248     copy_block17(full, src, 24, stride, 17);\
1249     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1251     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1252 }\
1253 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1254     uint8_t halfH[272];\
1255     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1256     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1257 }
1258
/*
 * Store operations plugged into QPEL_MC as its OP argument.
 *
 * 'b' is the raw sum produced by the qpel lowpass filters above; their taps
 * (20, -6, 3, -1, each applied to a pair of samples) add up to 32, hence
 * the >>5.  The rounding variants add 16 before the shift, the no_rnd
 * variants add 15.  The shifted value is looked up in 'cm', which the
 * lowpass functions set to ff_cropTbl + MAX_NEG_CROP (presumably a
 * clamp-to-8-bit table; confirm against its initialisation).
 *
 * op_put*  overwrite the destination 'a'.
 * op_avg*  average the new value with what is already in 'a'
 *          (op_avg rounds up with +1, op_avg_no_rnd truncates).
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/*
 * Instantiate the qpel function families.  Inside the macro the second
 * argument is pasted as the function-name prefix (OPNAME), the third is
 * pasted into the names of the intermediate "put" helpers (RND) and the
 * fourth is the store operation (OP).  The meaning of the first argument is
 * not visible in this part of the file.
 * The avg_no_rnd family is disabled, so op_avg_no_rnd is unused by the
 * active instantiations.
 */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1272
/*
 * The (0,0) qpel position needs no interpolation, so the mc00 names are
 * simply aliased to existing whole-block pixel functions.
 * NOTE(review): the put_no_rnd 16x16 alias points at ff_put_pixels16x16_8_c
 * while every other alias uses the un-suffixed ff_*_c name -- confirm that
 * this difference is intended.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1279
1280 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1281     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1282     int i;
1283
1284     for(i=0; i<h; i++){
1285         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1286         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1287         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1288         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1289         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1290         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1291         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1292         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1293         dst+=dstStride;
1294         src+=srcStride;
1295     }
1296 }
1297
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points.  Each one forwards the block unchanged to the
 * matching *_pixels*_xy2_8_c helper and only supplies the block height
 * (16 rows for the 16-wide variants, 8 rows for the 8-wide ones).
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    put_pixels16_xy2_8_c(dst, src, stride, rows);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    put_pixels8_xy2_8_c(dst, src, stride, rows);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
#endif /* CONFIG_RV40_DECODER */
1312
1313 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1314     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1315     int i;
1316
1317     for(i=0; i<w; i++){
1318         const int src_1= src[ -srcStride];
1319         const int src0 = src[0          ];
1320         const int src1 = src[  srcStride];
1321         const int src2 = src[2*srcStride];
1322         const int src3 = src[3*srcStride];
1323         const int src4 = src[4*srcStride];
1324         const int src5 = src[5*srcStride];
1325         const int src6 = src[6*srcStride];
1326         const int src7 = src[7*srcStride];
1327         const int src8 = src[8*srcStride];
1328         const int src9 = src[9*srcStride];
1329         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1330         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1331         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1332         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1333         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1334         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1335         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1336         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1337         src++;
1338         dst++;
1339     }
1340 }
1341
/*
 * mspel 8x8, mc10: run the horizontal lowpass into a scratch block, then
 * let put_pixels8_l2_8 combine it with the unfiltered source (presumably an
 * average of the two inputs -- see its definition).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, hfilt, stride, stride, 8, 8);
}
1347
/* mspel 8x8, mc20: the horizontal lowpass written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1351
/*
 * mspel 8x8, mc30: same horizontal lowpass as mc10, but combined with the
 * source shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, hfilt, stride, stride, 8, 8);
}
1357
/* mspel 8x8, mc02: the vertical lowpass written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
1361
/*
 * mspel 8x8, mc12: combine the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2_8.
 *
 * The horizontal pass covers 11 rows starting one row above src because the
 * following vertical pass reads from one row above to two rows below the
 * 8 rows it outputs.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[8*11];
    uint8_t vonly[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vonly, src, 8, stride, 8);
    /* hrows + 8 skips the extra top row so the filter can reach back to it */
    wmv2_mspel8_v_lowpass(hv, hrows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vonly, hv, stride, 8, 8, 8);
}
/*
 * mspel 8x8, mc32: like mc12, except that the vertical-only pass reads the
 * source shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[8*11];
    uint8_t vonly[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vonly, src + 1, 8, stride, 8);
    /* hrows + 8 skips the extra top row so the filter can reach back to it */
    wmv2_mspel8_v_lowpass(hv, hrows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vonly, hv, stride, 8, 8, 8);
}
/*
 * mspel 8x8, mc22: horizontal lowpass over 11 rows (starting one row above
 * src) into a scratch buffer, then the vertical lowpass of that buffer
 * written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[8*11];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows + 8, stride, 8, 8);
}
1385
1386 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1387     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1388     int x;
1389     const int strength= ff_h263_loop_filter_strength[qscale];
1390
1391     for(x=0; x<8; x++){
1392         int d1, d2, ad1;
1393         int p0= src[x-2*stride];
1394         int p1= src[x-1*stride];
1395         int p2= src[x+0*stride];
1396         int p3= src[x+1*stride];
1397         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1398
1399         if     (d<-2*strength) d1= 0;
1400         else if(d<-  strength) d1=-2*strength - d;
1401         else if(d<   strength) d1= d;
1402         else if(d< 2*strength) d1= 2*strength - d;
1403         else                   d1= 0;
1404
1405         p1 += d1;
1406         p2 -= d1;
1407         if(p1&256) p1= ~(p1>>31);
1408         if(p2&256) p2= ~(p2>>31);
1409
1410         src[x-1*stride] = p1;
1411         src[x+0*stride] = p2;
1412
1413         ad1= FFABS(d1)>>1;
1414
1415         d2= av_clip((p0-p3)/4, -ad1, ad1);
1416
1417         src[x-2*stride] = p0 - d2;
1418         src[x+  stride] = p3 + d2;
1419     }
1420     }
1421 }
1422
1423 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1424     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1425     int y;
1426     const int strength= ff_h263_loop_filter_strength[qscale];
1427
1428     for(y=0; y<8; y++){
1429         int d1, d2, ad1;
1430         int p0= src[y*stride-2];
1431         int p1= src[y*stride-1];
1432         int p2= src[y*stride+0];
1433         int p3= src[y*stride+1];
1434         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1435
1436         if     (d<-2*strength) d1= 0;
1437         else if(d<-  strength) d1=-2*strength - d;
1438         else if(d<   strength) d1= d;
1439         else if(d< 2*strength) d1= 2*strength - d;
1440         else                   d1= 0;
1441
1442         p1 += d1;
1443         p2 -= d1;
1444         if(p1&256) p1= ~(p1>>31);
1445         if(p2&256) p2= ~(p2>>31);
1446
1447         src[y*stride-1] = p1;
1448         src[y*stride+0] = p2;
1449
1450         ad1= FFABS(d1)>>1;
1451
1452         d2= av_clip((p0-p3)/4, -ad1, ad1);
1453
1454         src[y*stride-2] = p0 - d2;
1455         src[y*stride+1] = p3 + d2;
1456     }
1457     }
1458 }
1459
/**
 * In-place separable (1, 2, 1)/4 smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch array scaled by 4; the top and
 * bottom rows are not filtered and are just multiplied by 4.  Pass 2
 * filters the scratch rows horizontally and writes back to src; the left
 * and right columns get only the vertical result, rounded (+2, >>2), while
 * interior columns are rounded with (+8, >>4).
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vfilt[8][8];
    int row, col;

    /* Vertical pass: the whole block is read before anything is written. */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row*stride;
        const int is_edge_row = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (is_edge_row)
                vfilt[row][col] = 4*line[col];
            else
                vfilt[row][col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* Horizontal pass and write-back. */
    for (row = 0; row < 8; row++) {
        uint8_t   *line = src + row*stride;
        const int *t    = vfilt[row];

        line[0] = (t[0] + 2) >> 2;
        line[7] = (t[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (t[col - 1] + 2*t[col] + t[col + 1] + 8) >> 4;
    }
}
1486
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1514
/**
 * Sum of absolute differences between pix1 and pix2 combined with its
 * right-hand neighbour, over a 16-pixel-wide block.
 *
 * Each reference sample is avg2(pix2[x], pix2[x+1]), so pix2 is read up to
 * index 16 in every row.  avg2 is defined elsewhere in the file (presumably
 * a rounded two-sample average).
 *
 * @param v         unused
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1542
/**
 * Sum of absolute differences between pix1 and pix2 combined with the row
 * below it, over a 16-pixel-wide block.
 *
 * Each reference sample is avg2(pix2[x], below[x]), where below is the next
 * row of pix2, so h + 1 rows of pix2 are read.  avg2 is defined elsewhere in
 * the file (presumably a rounded two-sample average).
 *
 * @param v         unused
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1572
/**
 * Sum of absolute differences for a 16-sample-wide block against a
 * reference interpolated both horizontally and vertically.
 *
 * Each reference value is avg4() of a 2x2 neighbourhood of pix2, so
 * 17 columns and rows 0..h of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare, 16 samples per row
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }

    return sad;
}
1602
/**
 * Sum of absolute differences between two 8-sample-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @param h         number of rows to accumulate
 * @return sum over h rows and 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1622
/**
 * Sum of absolute differences for an 8-sample-wide block against a
 * horizontally interpolated reference.
 *
 * Each reference value is avg2() of a pix2 sample and its right-hand
 * neighbour, so 9 columns of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare, 8 samples per row
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1642
/**
 * Sum of absolute differences for an 8-sample-wide block against a
 * vertically interpolated reference.
 *
 * Each reference value is avg2() of a pix2 sample and the sample one
 * line_size below it, so rows 0..h of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare, 8 samples per row
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }

    return sad;
}
1664
/**
 * Sum of absolute differences for an 8-sample-wide block against a
 * reference interpolated both horizontally and vertically.
 *
 * Each reference value is avg4() of a 2x2 neighbourhood of pix2, so
 * 9 columns and rows 0..h of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare, 8 samples per row
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to accumulate
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }

    return sad;
}
1686
1687 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1688     MpegEncContext *c = v;
1689     int score1=0;
1690     int score2=0;
1691     int x,y;
1692
1693     for(y=0; y<h; y++){
1694         for(x=0; x<16; x++){
1695             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1696         }
1697         if(y+1<h){
1698             for(x=0; x<15; x++){
1699                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1700                              - s1[x+1] + s1[x+1+stride])
1701                         -FFABS(  s2[x  ] - s2[x  +stride]
1702                              - s2[x+1] + s2[x+1+stride]);
1703             }
1704         }
1705         s1+= stride;
1706         s2+= stride;
1707     }
1708
1709     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1710     else  return score1 + FFABS(score2)*8;
1711 }
1712
1713 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1714     MpegEncContext *c = v;
1715     int score1=0;
1716     int score2=0;
1717     int x,y;
1718
1719     for(y=0; y<h; y++){
1720         for(x=0; x<8; x++){
1721             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1722         }
1723         if(y+1<h){
1724             for(x=0; x<7; x++){
1725                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1726                              - s1[x+1] + s1[x+1+stride])
1727                         -FFABS(  s2[x  ] - s2[x  +stride]
1728                              - s2[x+1] + s2[x+1+stride]);
1729             }
1730         }
1731         s1+= stride;
1732         s2+= stride;
1733     }
1734
1735     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1736     else  return score1 + FFABS(score2)*8;
1737 }
1738
1739 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1740     int i;
1741     unsigned int sum=0;
1742
1743     for(i=0; i<8*8; i++){
1744         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1745         int w= weight[i];
1746         b>>= RECON_SHIFT;
1747         assert(-512<b && b<512);
1748
1749         sum += (w*b)*(w*b)>>4;
1750     }
1751     return sum>>2;
1752 }
1753
1754 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1755     int i;
1756
1757     for(i=0; i<8*8; i++){
1758         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1759     }
1760 }
1761
1762 /**
1763  * permutes an 8x8 block.
1764  * @param block the block which will be permuted according to the given permutation vector
1765  * @param permutation the permutation vector
1766  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1767  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1768  *                  (inverse) permutated to scantable order!
1769  */
1770 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1771 {
1772     int i;
1773     DCTELEM temp[64];
1774
1775     if(last<=0) return;
1776     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1777
1778     for(i=0; i<=last; i++){
1779         const int j= scantable[i];
1780         temp[j]= block[j];
1781         block[j]=0;
1782     }
1783
1784     for(i=0; i<=last; i++){
1785         const int j= scantable[i];
1786         const int perm_j= permutation[j];
1787         block[perm_j]= temp[j];
1788     }
1789 }
1790
/**
 * Comparison function that ignores all of its arguments.
 *
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
1794
1795 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1796     int i;
1797
1798     memset(cmp, 0, sizeof(void*)*6);
1799
1800     for(i=0; i<6; i++){
1801         switch(type&0xFF){
1802         case FF_CMP_SAD:
1803             cmp[i]= c->sad[i];
1804             break;
1805         case FF_CMP_SATD:
1806             cmp[i]= c->hadamard8_diff[i];
1807             break;
1808         case FF_CMP_SSE:
1809             cmp[i]= c->sse[i];
1810             break;
1811         case FF_CMP_DCT:
1812             cmp[i]= c->dct_sad[i];
1813             break;
1814         case FF_CMP_DCT264:
1815             cmp[i]= c->dct264_sad[i];
1816             break;
1817         case FF_CMP_DCTMAX:
1818             cmp[i]= c->dct_max[i];
1819             break;
1820         case FF_CMP_PSNR:
1821             cmp[i]= c->quant_psnr[i];
1822             break;
1823         case FF_CMP_BIT:
1824             cmp[i]= c->bit[i];
1825             break;
1826         case FF_CMP_RD:
1827             cmp[i]= c->rd[i];
1828             break;
1829         case FF_CMP_VSAD:
1830             cmp[i]= c->vsad[i];
1831             break;
1832         case FF_CMP_VSSE:
1833             cmp[i]= c->vsse[i];
1834             break;
1835         case FF_CMP_ZERO:
1836             cmp[i]= zero_cmp;
1837             break;
1838         case FF_CMP_NSSE:
1839             cmp[i]= c->nsse[i];
1840             break;
1841 #if CONFIG_DWT
1842         case FF_CMP_W53:
1843             cmp[i]= c->w53[i];
1844             break;
1845         case FF_CMP_W97:
1846             cmp[i]= c->w97[i];
1847             break;
1848 #endif
1849         default:
1850             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1851         }
1852     }
1853 }
1854
1855 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1856     long i;
1857     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1858         long a = *(long*)(src+i);
1859         long b = *(long*)(dst+i);
1860         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1861     }
1862     for(; i<w; i++)
1863         dst[i+0] += src[i+0];
1864 }
1865
1866 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1867     long i;
1868     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1869         long a = *(long*)(src1+i);
1870         long b = *(long*)(src2+i);
1871         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1872     }
1873     for(; i<w; i++)
1874         dst[i] = src1[i]+src2[i];
1875 }
1876
1877 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1878     long i;
1879 #if !HAVE_FAST_UNALIGNED
1880     if((long)src2 & (sizeof(long)-1)){
1881         for(i=0; i+7<w; i+=8){
1882             dst[i+0] = src1[i+0]-src2[i+0];
1883             dst[i+1] = src1[i+1]-src2[i+1];
1884             dst[i+2] = src1[i+2]-src2[i+2];
1885             dst[i+3] = src1[i+3]-src2[i+3];
1886             dst[i+4] = src1[i+4]-src2[i+4];
1887             dst[i+5] = src1[i+5]-src2[i+5];
1888             dst[i+6] = src1[i+6]-src2[i+6];
1889             dst[i+7] = src1[i+7]-src2[i+7];
1890         }
1891     }else
1892 #endif
1893     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1894         long a = *(long*)(src1+i);
1895         long b = *(long*)(src2+i);
1896         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1897     }
1898     for(; i<w; i++)
1899         dst[i+0] = src1[i+0]-src2[i+0];
1900 }
1901
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For each position the prediction is mid_pred() of the running left
 * value, src1[i], and (left + src1[i] - left_top) reduced to 8 bits; the
 * residual diff[i] is added to it (modulo 256) and stored in dst[i].
 * src1 is presumably the previously decoded row - confirm against callers.
 *
 * @param dst      reconstructed output, w bytes
 * @param src1     reference row, w bytes
 * @param diff     residuals, w bytes
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial top-left value; out: last src1 value read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t cur_diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient = (cur + src1[x] - cur_diag) & 0xFF;

        cur      = mid_pred(cur, src1[x], gradient) + diff[x];
        cur_diag = src1[x];
        dst[x]   = cur;
    }

    *left     = cur;
    *left_top = cur_diag;
}
1918
/**
 * Compute residuals of a row against a median predictor.
 *
 * For each position the prediction is mid_pred() of the running left
 * value, src1[i], and (left + src1[i] - left_top) reduced to 8 bits; the
 * output is src2[i] minus that prediction (modulo 256).
 * src1 is presumably the row above src2 - confirm against callers.
 *
 * @param dst      residual output, w bytes
 * @param src1     reference row, w bytes
 * @param src2     row being coded, w bytes
 * @param w        number of samples
 * @param left     in: initial left value; out: last src2 value read
 * @param left_top in: initial top-left value; out: last src1 value read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t cur_diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient  = (cur + src1[x] - cur_diag) & 0xFF;
        const int predicted = mid_pred(cur, src1[x], gradient);

        cur_diag = src1[x];
        cur      = src2[x];
        dst[x]   = cur - predicted;
    }

    *left     = cur;
    *left_top = cur_diag;
}
1936
/**
 * Running (prefix) sum of a row: every output byte is the low 8 bits of
 * the accumulator after adding the corresponding input byte.
 *
 * @param dst destination, w bytes
 * @param src input deltas, w bytes
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1955
/* Byte offsets of the four channels inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over a row of 4-byte pixels.
 *
 * Each channel keeps its own accumulator; every output byte is the low
 * 8 bits of the accumulator after adding the matching input byte.
 *
 * @param dst   destination, 4*w bytes
 * @param src   input deltas, 4*w bytes
 * @param w     number of pixels
 * @param red   in: initial red accumulator;   out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator;  out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    const uint8_t *in = src;
    uint8_t *out      = dst;
    int remaining;

    for (remaining = w; remaining > 0; remaining--) {
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;

        in  += 4;
        out += 4;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1996
/**
 * Out-of-place butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * i1 and i2 are each evaluated twice. Wrapped in do/while(0) so the macro
 * behaves as a single statement, e.g. in an unbraced if/else.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    o1= (i1)+(i2);\
    o2= (i1)-(i2);\
} while (0)

/**
 * In-place butterfly: (x, y) becomes (x + y, x - y).
 * The temporaries carry a prefix so they cannot capture identifiers used
 * in the macro arguments.
 */
#define BUTTERFLY1(x,y) \
do {\
    int bfly_a,bfly_b;\
    bfly_a= x;\
    bfly_b= y;\
    x= bfly_a+bfly_b;\
    y= bfly_a-bfly_b;\
} while (0)

/** Value of the final butterfly stage folded into a sum: |x + y| + |x - y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2011
/**
 * Sum of absolute values of a two-dimensional 8-point butterfly transform
 * applied to the difference src - dst of two 8x8 blocks.
 *
 * Three butterfly stages are run along every row, then three along every
 * column; the last column stage is folded into the accumulation through
 * BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    first block (subtracted)
 * @param src    second block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return accumulated absolute transform values
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int row, col;

    assert(h==8);

    /* horizontal pass, one row of differences at a time */
    for (row = 0; row < 8; row++) {
        const uint8_t *in  = src + stride * row;
        const uint8_t *ref = dst + stride * row;
        int *t             = coef + 8 * row;

        BUTTERFLY2(t[0], t[1], in[0] - ref[0], in[1] - ref[1]);
        BUTTERFLY2(t[2], t[3], in[2] - ref[2], in[3] - ref[3]);
        BUTTERFLY2(t[4], t[5], in[4] - ref[4], in[5] - ref[5]);
        BUTTERFLY2(t[6], t[7], in[6] - ref[6], in[7] - ref[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass; t[8 * k] is the entry of row k in the current column */
    for (col = 0; col < 8; col++) {
        int *t = coef + col;

        BUTTERFLY1(t[8 * 0], t[8 * 1]);
        BUTTERFLY1(t[8 * 2], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 5]);
        BUTTERFLY1(t[8 * 6], t[8 * 7]);

        BUTTERFLY1(t[8 * 0], t[8 * 2]);
        BUTTERFLY1(t[8 * 1], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 6]);
        BUTTERFLY1(t[8 * 5], t[8 * 7]);

        total += BUTTERFLYA(t[8 * 0], t[8 * 4])
               + BUTTERFLYA(t[8 * 1], t[8 * 5])
               + BUTTERFLYA(t[8 * 2], t[8 * 6])
               + BUTTERFLYA(t[8 * 3], t[8 * 7]);
    }

    return total;
}
2056
/**
 * Sum of absolute values of a two-dimensional 8-point butterfly transform
 * applied to a single 8x8 block, with the mean term removed.
 *
 * Three butterfly stages are run along every row, then three along every
 * column; the last column stage is folded into the accumulation through
 * BUTTERFLYA. Finally |coef[row 0, col 0] + coef[row 4, col 0]| is
 * subtracted again (marked "-mean" in the original code).
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return accumulated absolute transform values without the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int row, col;

    assert(h==8);

    /* horizontal pass, one row of samples at a time */
    for (row = 0; row < 8; row++) {
        const uint8_t *in = src + stride * row;
        int *t            = coef + 8 * row;

        BUTTERFLY2(t[0], t[1], in[0], in[1]);
        BUTTERFLY2(t[2], t[3], in[2], in[3]);
        BUTTERFLY2(t[4], t[5], in[4], in[5]);
        BUTTERFLY2(t[6], t[7], in[6], in[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass; t[8 * k] is the entry of row k in the current column */
    for (col = 0; col < 8; col++) {
        int *t = coef + col;

        BUTTERFLY1(t[8 * 0], t[8 * 1]);
        BUTTERFLY1(t[8 * 2], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 5]);
        BUTTERFLY1(t[8 * 6], t[8 * 7]);

        BUTTERFLY1(t[8 * 0], t[8 * 2]);
        BUTTERFLY1(t[8 * 1], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 6]);
        BUTTERFLY1(t[8 * 5], t[8 * 7]);

        total += BUTTERFLYA(t[8 * 0], t[8 * 4])
               + BUTTERFLYA(t[8 * 1], t[8 * 5])
               + BUTTERFLYA(t[8 * 2], t[8 * 6])
               + BUTTERFLYA(t[8 * 3], t[8 * 7]);
    }

    total -= FFABS(coef[8 * 0] + coef[8 * 4]); /* -mean */

    return total;
}
2104
2105 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2106     MpegEncContext * const s= (MpegEncContext *)c;
2107     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2108
2109     assert(h==8);
2110
2111     s->dsp.diff_pixels(temp, src1, src2, stride);
2112     s->dsp.fdct(temp);
2113     return s->dsp.sum_abs_dctelem(temp);
2114 }
2115
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built from adds and shifts only.
 * Inputs are fetched through SRC(n) and results delivered through
 * DST(n, value); both must be defined by the user of the macro.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied in both
 * directions to the difference of two 8x8 blocks.
 *
 * The first pass transforms each row in place; the second pass runs over
 * the columns and accumulates absolute values instead of storing them.
 *
 * @param c      MpegEncContext supplying dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused
 * @return accumulated absolute coefficient values
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = c;
    DCTELEM blk[8][8];
    int sad = 0;
    int n;

    enc->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* pass 1: rows, results written back into the block */
#define SRC(x) blk[n][x]
#define DST(x,v) blk[n][x]= v
    for (n = 0; n < 8; n++)
        DCT8_1D
#undef SRC
#undef DST

    /* pass 2: columns, results only accumulated */
#define SRC(x) blk[x][n]
#define DST(x,v) sad += FFABS(v)
    for (n = 0; n < 8; n++)
        DCT8_1D
#undef SRC
#undef DST

    return sad;
}
#endif
2168
2169 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2170     MpegEncContext * const s= (MpegEncContext *)c;
2171     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2172     int sum=0, i;
2173
2174     assert(h==8);
2175
2176     s->dsp.diff_pixels(temp, src1, src2, stride);
2177     s->dsp.fdct(temp);
2178
2179     for(i=0; i<64; i++)
2180         sum= FFMAX(sum, FFABS(temp[i]));
2181
2182     return sum;
2183 }
2184
2185 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2186     MpegEncContext * const s= (MpegEncContext *)c;
2187     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2188     DCTELEM * const bak = temp+64;
2189     int sum=0, i;
2190
2191     assert(h==8);
2192     s->mb_intra=0;
2193
2194     s->dsp.diff_pixels(temp, src1, src2, stride);
2195
2196     memcpy(bak, temp, 64*sizeof(DCTELEM));
2197
2198     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2199     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2200     ff_simple_idct_8(temp); //FIXME
2201
2202     for(i=0; i<64; i++)
2203         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2204
2205     return sum;
2206 }
2207
2208 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2209     MpegEncContext * const s= (MpegEncContext *)c;
2210     const uint8_t *scantable= s->intra_scantable.permutated;
2211     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2212     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2213     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2214     int i, last, run, bits, level, distortion, start_i;
2215     const int esc_length= s->ac_esc_length;
2216     uint8_t * length;
2217     uint8_t * last_length;
2218
2219     assert(h==8);
2220
2221     copy_block8(lsrc1, src1, 8, stride, 8);
2222     copy_block8(lsrc2, src2, 8, stride, 8);
2223
2224     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2225
2226     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2227
2228     bits=0;
2229
2230     if (s->mb_intra) {
2231         start_i = 1;
2232         length     = s->intra_ac_vlc_length;
2233         last_length= s->intra_ac_vlc_last_length;
2234         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2235     } else {
2236         start_i = 0;
2237         length     = s->inter_ac_vlc_length;
2238         last_length= s->inter_ac_vlc_last_length;
2239     }
2240
2241     if(last>=start_i){
2242         run=0;
2243         for(i=start_i; i<last; i++){
2244             int j= scantable[i];
2245             level= temp[j];
2246
2247             if(level){
2248                 level+=64;
2249                 if((level&(~127)) == 0){
2250                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2251                 }else
2252                     bits+= esc_length;
2253                 run=0;
2254             }else
2255                 run++;
2256         }
2257         i= scantable[last];
2258
2259         level= temp[i] + 64;
2260
2261         assert(level - 64);
2262
2263         if((level&(~127)) == 0){
2264             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2265         }else
2266             bits+= esc_length;
2267
2268     }
2269
2270     if(last>=0){
2271         if(s->mb_intra)
2272             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2273         else
2274             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2275     }
2276
2277     s->dsp.idct_add(lsrc2, 8, temp);
2278
2279     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2280
2281     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2282 }
2283
2284 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2285     MpegEncContext * const s= (MpegEncContext *)c;
2286     const uint8_t *scantable= s->intra_scantable.permutated;
2287     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2288     int i, last, run, bits, level, start_i;
2289     const int esc_length= s->ac_esc_length;
2290     uint8_t * length;
2291     uint8_t * last_length;
2292
2293     assert(h==8);
2294
2295     s->dsp.diff_pixels(temp, src1, src2, stride);
2296
2297     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2298
2299     bits=0;
2300
2301     if (s->mb_intra) {
2302         start_i = 1;
2303         length     = s->intra_ac_vlc_length;
2304         last_length= s->intra_ac_vlc_last_length;
2305         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2306     } else {
2307         start_i = 0;
2308         length     = s->inter_ac_vlc_length;
2309         last_length= s->inter_ac_vlc_last_length;
2310     }
2311
2312     if(last>=start_i){
2313         run=0;
2314         for(i=start_i; i<last; i++){
2315             int j= scantable[i];
2316             level= temp[j];
2317
2318             if(level){
2319                 level+=64;
2320                 if((level&(~127)) == 0){
2321                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2322                 }else
2323                     bits+= esc_length;
2324                 run=0;
2325             }else
2326                 run++;
2327         }
2328         i= scantable[last];
2329
2330         level= temp[i] + 64;
2331
2332         assert(level - 64);
2333
2334         if((level&(~127)) == 0){
2335             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2336         }else
2337             bits+= esc_length;
2338     }
2339
2340     return bits;
2341 }
2342
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every sample and the sample one stride below it, over rows 0..h-2 and
 * <size> columns. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - s[col + stride]);                                                       \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2360
/**
 * Vertical-gradient SAD of the difference of two 16-sample-wide blocks:
 * accumulates |(s1 - s2) at a sample - (s1 - s2) one stride below| over
 * rows 0..h-2 and 16 columns.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows (h - 1 row pairs are evaluated)
 * @return accumulated absolute value
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        }

        s1 += stride;
        s2 += stride;
    }

    return total;
}
2375
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every sample and the sample one stride below it, over rows 0..h-2 and
 * <size> columns. The c and dummy arguments are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - s[col + stride]);                                                          \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2394
/**
 * Vertical-gradient SSE of the difference of two 16-sample-wide blocks:
 * accumulates the square of ((s1 - s2) at a sample - (s1 - s2) one stride
 * below) over rows 0..h-2 and 16 columns.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows (h - 1 row pairs are evaluated)
 * @return accumulated squared value
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int grad = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += grad * grad;
        }

        s1 += stride;
        s2 += stride;
    }

    return total;
}
2409
/**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first vector, size elements
 * @param pix2 second vector, size elements
 * @param size number of elements
 * @return sum of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
    }

    return total;
}
2418
2419 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2420 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2421 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2422 #if CONFIG_GPL
2423 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2424 #endif
2425 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2426 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2427 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2428 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2429
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 *
 * @param dst  output, len elements
 * @param src0 first factor, len elements
 * @param src1 second factor, len elements
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
2435
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len - 1 - i].
 *
 * @param dst  output, len elements
 * @param src0 first factor, len elements
 * @param src1 second factor, len elements, traversed in reverse
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const float *rev = src1 + (len - 1);   /* last element of src1 */
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * *(rev - k);
}
2442
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output, len elements
 * @param src0 first factor, len elements
 * @param src1 second factor, len elements
 * @param src2 addend, len elements
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2448
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 *
 * With k running from 0 to len - 1, m = 2*len - 1 - k,
 * s0 = src0[k] and s1 = src1[len - 1 - k]:
 *   dst[k] = s0 * win[m] - s1 * win[k]
 *   dst[m] = s0 * win[k] + s1 * win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements, read front to back
 * @param src1 second input, len elements, read back to front
 * @param win  window, 2*len elements
 * @param len  number of elements in each input
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo, hi;

    for (lo = 0, hi = 2 * len - 1; lo < len; lo++, hi--) {
        /* fetch everything before storing, as both outputs depend on all four */
        const float s0   = src0[lo];
        const float s1   = src1[hi - len];
        const float w_lo = win[lo];
        const float w_hi = win[hi];

        dst[lo] = s0 * w_hi - s1 * w_lo;
        dst[hi] = s0 * w_lo + s1 * w_hi;
    }
}
2465
/**
 * Scale a float vector by a constant.
 *
 * @param dst output, dst[k] = src[k] * mul
 * @param src input vector
 * @param mul scale factor
 * @param len number of elements to process
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
2473
/**
 * In-place butterfly on two float vectors.
 *
 * After the call v1[k] holds the old v1[k] + v2[k] and v2[k] holds the
 * old v1[k] - v2[k].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements to process
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2484
/**
 * Dot product of two float vectors, accumulated in single precision in
 * ascending index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[k] * v2[k] for k in [0, len); 0.0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2495
/**
 * Clip one value given as a raw 32-bit pattern, using unsigned compares only.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the lower bound
 * @param maxi     bit pattern of the upper bound
 * @param maxisign maxi with bit 31 flipped
 * @return mini when a > mini (unsigned), otherwise maxi when a with bit 31
 *         flipped is > maxisign (unsigned), otherwise a unchanged
 *
 * NOTE(review): this matches a float clip presumably only for IEEE-754
 * binary32 patterns with a negative lower and a positive upper bound --
 * confirm against the caller.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2504
/**
 * Clip a float vector through clipf_c_one(), working on the raw 32-bit
 * patterns of the samples and of the bounds.
 *
 * Elements are handled in groups of 8: every group whose first index is
 * below len is processed completely, so both buffers must cover len rounded
 * up to a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t  lo_bits = *(uint32_t*)min;
    const uint32_t  hi_bits = *(uint32_t*)max;
    const uint32_t  hi_flip = hi_bits ^ (1U<<31);
    uint32_t       *out     = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When min is negative and max positive the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per element.
 * Elements are handled in groups of 8: every group whose first index is
 * below len is processed completely.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2540
/**
 * Dot product of two int16_t vectors where each product is right-shifted
 * before being accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every individual product
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; k != order; k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
2550
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For each element the product v1[k] * v2[k] is accumulated using the value
 * of v1[k] before the update, then v1[k] is increased by mul * v3[k].
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor applied to v3
 * @return sum of old v1[k] * v2[k]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; k != order; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2560
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k weights
 * both input[k] and its mirror input[len - 1 - k]. Each product is rounded
 * by adding 1 << 14 and shifting right by 15. For odd len the middle output
 * element is not written.
 *
 * @param output windowed samples
 * @param input  source samples
 * @param window first half of the window
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int head;

    for (head = 0; head < half; head++) {
        const unsigned int tail = len - head - 1;
        const int16_t coef      = window[head];

        output[head] = (MUL16(input[head], coef) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], coef) + (1 << 14)) >> 15;
    }
}
2573
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Works in groups of 8 inside a do/while loop: the first group is always
 * processed, and len is decremented by 8 per group as an unsigned value, so
 * len has to be a non-zero multiple of 8 for the loop to stop at the end of
 * the data.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);
        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2589
/* Fixed-point coefficients of the WMV2 IDCT; the formula each value was
 * derived from is given next to it. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 IDCT: transforms the 8 consecutive coefficients
 * b[0..7] in place. Results are rounded with 1 << 7 and shifted right by 8.
 *
 * @param b pointer to the first coefficient of the row
 */
static void wmv2_idct_row(short * b)
{
    /* stage 1: weighted pairs, all inputs are read before any output is stored */
    const int odd1 = W1*b[1]+W7*b[7];
    const int odd7 = W7*b[1]-W1*b[7];
    const int odd5 = W5*b[5]+W3*b[3];
    const int odd3 = W3*b[5]-W5*b[3];
    const int ev2  = W2*b[2]+W6*b[6];
    const int ev6  = W6*b[2]-W2*b[6];
    const int ev0  = W0*b[0]+W0*b[4];
    const int ev4  = W0*b[0]-W0*b[4];
    /* stage 2: odd-term combinations scaled by 181/256 */
    const int rot1 = (181*(odd1-odd5+odd7-odd3)+128)>>8;
    const int rot2 = (181*(odd1-odd5-odd7+odd3)+128)>>8;
    /* stage 3: sum/difference pairs producing mirrored outputs */
    const int bias      = 1<<7;
    const int outer_sum = ev0+ev2;
    const int outer_dif = ev0-ev2;
    const int inner_sum = ev4+ev6;
    const int inner_dif = ev4-ev6;
    const int odd_a     = odd1+odd5;
    const int odd_b     = odd7+odd3;

    b[0] = (outer_sum + odd_a + bias)>>8;
    b[7] = (outer_sum - odd_a + bias)>>8;
    b[1] = (inner_sum + rot1  + bias)>>8;
    b[6] = (inner_sum - rot1  + bias)>>8;
    b[2] = (inner_dif + rot2  + bias)>>8;
    b[5] = (inner_dif - rot2  + bias)>>8;
    b[3] = (outer_dif + odd_b + bias)>>8;
    b[4] = (outer_dif - odd_b + bias)>>8;
}
2625 static void wmv2_idct_col(short * b)
2626 {
2627     int s1,s2;
2628     int a0,a1,a2,a3,a4,a5,a6,a7;
2629     /*step 1, with extended precision*/
2630     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2631     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2632     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2633     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2634     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2635     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2636     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2637     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2638     /*step 2*/
2639     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2640     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2641     /*step 3*/
2642     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2643     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2644     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2645     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2646
2647     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2648     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2649     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2650     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2651 }
/**
 * In-place WMV2 IDCT of one 8x8 block of 64 coefficients: the row pass is
 * run on each of the 8 rows first, then the column pass on each of the 8
 * columns.
 *
 * @param block 64 coefficients, 8 per row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2662 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2663  converted */
2664 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2665 {
2666     ff_wmv2_idct_c(block);
2667     ff_put_pixels_clamped_c(block, dest, line_size);
2668 }
2669 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2670 {
2671     ff_wmv2_idct_c(block);
2672     ff_add_pixels_clamped_c(block, dest, line_size);
2673 }
2674 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2675 {
2676     j_rev_dct (block);
2677     ff_put_pixels_clamped_c(block, dest, line_size);
2678 }
2679 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2680 {
2681     j_rev_dct (block);
2682     ff_add_pixels_clamped_c(block, dest, line_size);
2683 }
2684
2685 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2686 {
2687     j_rev_dct4 (block);
2688     put_pixels_clamped4_c(block, dest, line_size);
2689 }
2690 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2691 {
2692     j_rev_dct4 (block);
2693     add_pixels_clamped4_c(block, dest, line_size);
2694 }
2695
2696 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2697 {
2698     j_rev_dct2 (block);
2699     put_pixels_clamped2_c(block, dest, line_size);
2700 }
2701 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2702 {
2703     j_rev_dct2 (block);
2704     add_pixels_clamped2_c(block, dest, line_size);
2705 }
2706
2707 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2708 {
2709     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2710
2711     dest[0] = cm[(block[0] + 4)>>3];
2712 }
2713 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2714 {
2715     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2716
2717     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2718 }
2719
2720 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2721
2722 /* init static data */
2723 av_cold void dsputil_static_init(void)
2724 {
2725     int i;
2726
2727     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2728     for(i=0;i<MAX_NEG_CROP;i++) {
2729         ff_cropTbl[i] = 0;
2730         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2731     }
2732
2733     for(i=0;i<512;i++) {
2734         ff_squareTbl[i] = (i - 256) * (i - 256);
2735     }
2736
2737     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2738 }
2739
2740 int ff_check_alignment(void){
2741     static int did_fail=0;
2742     LOCAL_ALIGNED_16(int, aligned, [4]);
2743
2744     if((intptr_t)aligned & 15){
2745         if(!did_fail){
2746 #if HAVE_MMX || HAVE_ALTIVEC
2747             av_log(NULL, AV_LOG_ERROR,
2748                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2749                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2750                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2751                 "Do not report crashes to Libav developers.\n");
2752 #endif
2753             did_fail=1;
2754         }
2755         return -1;
2756     }
2757     return 0;
2758 }
2759
/**
 * Fill a DSPContext with function pointers.
 *
 * The C implementations are installed first, chosen from avctx->dct_algo,
 * avctx->idct_algo, avctx->lowres and avctx->bits_per_raw_sample. The
 * architecture-specific init functions enabled at build time are called
 * afterwards, and finally the tables that depend on the result (2-tap qpel
 * fallbacks, RV30/RV40 aliases, IDCT permutation) are completed.
 *
 * @param c     context to fill; c->dct_bits is read here but never written
 *              in this function -- NOTE(review): it presumably has to be set
 *              by the caller beforehand, confirm
 * @param avctx codec context supplying the selection parameters
 */
2760 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2761 {
2762     int i;
2763
     /* return value is ignored; the call is only made for its warning */
2764     ff_check_alignment();
2765
     /* forward DCT: 10-bit input always uses the islow_10 pair, otherwise
      * dct_algo selects ifast, FAAN or the islow_8 default */
2766 #if CONFIG_ENCODERS
2767     if (avctx->bits_per_raw_sample == 10) {
2768         c->fdct    = ff_jpeg_fdct_islow_10;
2769         c->fdct248 = ff_fdct248_islow_10;
2770     } else {
2771         if(avctx->dct_algo==FF_DCT_FASTINT) {
2772             c->fdct    = fdct_ifast;
2773             c->fdct248 = fdct_ifast248;
2774         }
2775         else if(avctx->dct_algo==FF_DCT_FAAN) {
2776             c->fdct    = ff_faandct;
2777             c->fdct248 = ff_faandct248;
2778         }
2779         else {
2780             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2781             c->fdct248 = ff_fdct248_islow_8;
2782         }
2783     }
2784 #endif //CONFIG_ENCODERS
2785
     /* inverse DCT: lowres 1/2/3 take the j_rev_dct4/2/1 variants; otherwise
      * 10-bit input takes the simple_idct_10 set and 8-bit input is chosen by
      * idct_algo. Only FF_IDCT_INT uses a non-identity permutation type. */
2786     if(avctx->lowres==1){
2787         c->idct_put= ff_jref_idct4_put;
2788         c->idct_add= ff_jref_idct4_add;
2789         c->idct    = j_rev_dct4;
2790         c->idct_permutation_type= FF_NO_IDCT_PERM;
2791     }else if(avctx->lowres==2){
2792         c->idct_put= ff_jref_idct2_put;
2793         c->idct_add= ff_jref_idct2_add;
2794         c->idct    = j_rev_dct2;
2795         c->idct_permutation_type= FF_NO_IDCT_PERM;
2796     }else if(avctx->lowres==3){
2797         c->idct_put= ff_jref_idct1_put;
2798         c->idct_add= ff_jref_idct1_add;
2799         c->idct    = j_rev_dct1;
2800         c->idct_permutation_type= FF_NO_IDCT_PERM;
2801     }else{
2802         if (avctx->bits_per_raw_sample == 10) {
2803             c->idct_put              = ff_simple_idct_put_10;
2804             c->idct_add              = ff_simple_idct_add_10;
2805             c->idct                  = ff_simple_idct_10;
2806             c->idct_permutation_type = FF_NO_IDCT_PERM;
2807         } else {
2808         if(avctx->idct_algo==FF_IDCT_INT){
2809             c->idct_put= ff_jref_idct_put;
2810             c->idct_add= ff_jref_idct_add;
2811             c->idct    = j_rev_dct;
2812             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2813         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2814                 avctx->idct_algo==FF_IDCT_VP3){
2815             c->idct_put= ff_vp3_idct_put_c;
2816             c->idct_add= ff_vp3_idct_add_c;
2817             c->idct    = ff_vp3_idct_c;
2818             c->idct_permutation_type= FF_NO_IDCT_PERM;
2819         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2820             c->idct_put= ff_wmv2_idct_put_c;
2821             c->idct_add= ff_wmv2_idct_add_c;
2822             c->idct    = ff_wmv2_idct_c;
2823             c->idct_permutation_type= FF_NO_IDCT_PERM;
2824         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2825             c->idct_put= ff_faanidct_put;
2826             c->idct_add= ff_faanidct_add;
2827             c->idct    = ff_faanidct;
2828             c->idct_permutation_type= FF_NO_IDCT_PERM;
2829         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
             /* only idct_put is assigned in this branch; idct_add and idct
              * keep whatever value c held on entry */
2830             c->idct_put= ff_ea_idct_put_c;
2831             c->idct_permutation_type= FF_NO_IDCT_PERM;
2832         }else{ //accurate/default
2833             c->idct_put = ff_simple_idct_put_8;
2834             c->idct_add = ff_simple_idct_add_8;
2835             c->idct     = ff_simple_idct_8;
2836             c->idct_permutation_type= FF_NO_IDCT_PERM;
2837         }
2838         }
2839     }
2840
     /* bit-depth independent C defaults */
2841     c->diff_pixels = diff_pixels_c;
2842     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2843     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2844     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2845     c->sum_abs_dctelem = sum_abs_dctelem_c;
2846     c->gmc1 = gmc1_c;
2847     c->gmc = ff_gmc_c;
2848     c->pix_sum = pix_sum_c;
2849     c->pix_norm1 = pix_norm1_c;
2850
2851     c->fill_block_tab[0] = fill_block16_c;
2852     c->fill_block_tab[1] = fill_block8_c;
2853     c->scale_block = scale_block_c;
2854
2855     /* TODO [0] 16  [1] 8 */
2856     c->pix_abs[0][0] = pix_abs16_c;
2857     c->pix_abs[0][1] = pix_abs16_x2_c;
2858     c->pix_abs[0][2] = pix_abs16_y2_c;
2859     c->pix_abs[0][3] = pix_abs16_xy2_c;
2860     c->pix_abs[1][0] = pix_abs8_c;
2861     c->pix_abs[1][1] = pix_abs8_x2_c;
2862     c->pix_abs[1][2] = pix_abs8_y2_c;
2863     c->pix_abs[1][3] = pix_abs8_xy2_c;
2864
     /* third-pel tables: indices 3, 7 and 11 are not assigned here */
2865     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2866     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2867     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2868     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2869     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2870     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2871     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2872     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2873     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2874
2875     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2876     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2877     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2878     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2879     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2880     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2881     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2882     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2883     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2884
     /* dspfunc(PFX, IDX, NUM) fills the 16 entries of c->PFX_pixels_tab[IDX]
      * with PFX<NUM>_mc00_c ... PFX<NUM>_mc33_c */
2885 #define dspfunc(PFX, IDX, NUM) \
2886     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2887     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2888     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2889     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2890     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2891     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2892     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2893     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2894     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2895     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2896     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2897     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2898     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2899     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2900     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2901     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2902
2903     dspfunc(put_qpel, 0, 16);
2904     dspfunc(put_no_rnd_qpel, 0, 16);
2905
2906     dspfunc(avg_qpel, 0, 16);
2907     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2908
2909     dspfunc(put_qpel, 1, 8);
2910     dspfunc(put_no_rnd_qpel, 1, 8);
2911
2912     dspfunc(avg_qpel, 1, 8);
2913     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2914
2915 #undef dspfunc
2916
     /* codec-specific DSP init, each compiled in only with its decoder */
2917 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2918     ff_mlp_init(c, avctx);
2919 #endif
2920 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2921     ff_intrax8dsp_init(c,avctx);
2922 #endif
2923 #if CONFIG_RV30_DECODER
2924     ff_rv30dsp_init(c,avctx);
2925 #endif
2926 #if CONFIG_RV40_DECODER
2927     ff_rv40dsp_init(c,avctx);
     /* the mc33 entries are assigned here, after ff_rv40dsp_init() returns */
2928     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
2929     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
2930     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
2931     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
2932 #endif
2933
2934     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2935     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2936     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2937     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2938     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2939     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2940     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2941     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2942
     /* SET_CMP_FUNC(name) stores name16_c in c->name[0] and name8x8_c in
      * c->name[1] */
2943 #define SET_CMP_FUNC(name) \
2944     c->name[0]= name ## 16_c;\
2945     c->name[1]= name ## 8x8_c;
2946
2947     SET_CMP_FUNC(hadamard8_diff)
2948     c->hadamard8_diff[4]= hadamard8_intra16_c;
2949     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2950     SET_CMP_FUNC(dct_sad)
2951     SET_CMP_FUNC(dct_max)
2952 #if CONFIG_GPL
2953     SET_CMP_FUNC(dct264_sad)
2954 #endif
2955     c->sad[0]= pix_abs16_c;
2956     c->sad[1]= pix_abs8_c;
2957     c->sse[0]= sse16_c;
2958     c->sse[1]= sse8_c;
2959     c->sse[2]= sse4_c;
2960     SET_CMP_FUNC(quant_psnr)
2961     SET_CMP_FUNC(rd)
2962     SET_CMP_FUNC(bit)
2963     c->vsad[0]= vsad16_c;
2964     c->vsad[4]= vsad_intra16_c;
2965     c->vsad[5]= vsad_intra8_c;
2966     c->vsse[0]= vsse16_c;
2967     c->vsse[4]= vsse_intra16_c;
2968     c->vsse[5]= vsse_intra8_c;
2969     c->nsse[0]= nsse16_c;
2970     c->nsse[1]= nsse8_c;
2971 #if CONFIG_DWT
2972     ff_dsputil_init_dwt(c);
2973 #endif
2974
2975     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2976
2977     c->add_bytes= add_bytes_c;
2978     c->add_bytes_l2= add_bytes_l2_c;
2979     c->diff_bytes= diff_bytes_c;
2980     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2981     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2982     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2983     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2984     c->bswap_buf= bswap_buf;
2985     c->bswap16_buf = bswap16_buf;
2986 #if CONFIG_PNG_DECODER
2987     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
2988 #endif
2989
2990     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2991         c->h263_h_loop_filter= h263_h_loop_filter_c;
2992         c->h263_v_loop_filter= h263_v_loop_filter_c;
2993     }
2994
2995     if (CONFIG_VP3_DECODER) {
2996         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2997         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2998         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2999     }
3000
3001     c->h261_loop_filter= h261_loop_filter_c;
3002
3003     c->try_8x8basis= try_8x8basis_c;
3004     c->add_8x8basis= add_8x8basis_c;
3005
3006 #if CONFIG_VORBIS_DECODER
3007     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3008 #endif
3009 #if CONFIG_AC3_DECODER
3010     c->ac3_downmix = ff_ac3_downmix_c;
3011 #endif
     /* float and integer vector helpers defined earlier in this file */
3012     c->vector_fmul = vector_fmul_c;
3013     c->vector_fmul_reverse = vector_fmul_reverse_c;
3014     c->vector_fmul_add = vector_fmul_add_c;
3015     c->vector_fmul_window = vector_fmul_window_c;
3016     c->vector_clipf = vector_clipf_c;
3017     c->scalarproduct_int16 = scalarproduct_int16_c;
3018     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3019     c->apply_window_int16 = apply_window_int16_c;
3020     c->vector_clip_int32 = vector_clip_int32_c;
3021     c->scalarproduct_float = scalarproduct_float_c;
3022     c->butterflies_float = butterflies_float_c;
3023     c->vector_fmul_scalar = vector_fmul_scalar_c;
3024
3025     c->shrink[0]= av_image_copy_plane;
3026     c->shrink[1]= ff_shrink22;
3027     c->shrink[2]= ff_shrink44;
3028     c->shrink[3]= ff_shrink88;
3029
     /* default prefetch is a no-op */
3030     c->prefetch= just_return;
3031
     /* zero the 2-tap qpel tables so that the loop after the arch-specific
      * init can tell which entries are still unset */
3032     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3033     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3034
     /* FUNC appends "_<depth>" and FUNCC appends "_<depth>_c" to a name */
3035 #undef FUNC
3036 #undef FUNCC
3037 #define FUNC(f, depth) f ## _ ## depth
3038 #define FUNCC(f, depth) f ## _ ## depth ## _c
3039
3040 #define dspfunc1(PFX, IDX, NUM, depth)\
3041     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3042     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3043     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3044     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3045
3046 #define dspfunc2(PFX, IDX, NUM, depth)\
3047     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3048     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3049     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3050     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3051     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3052     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3053     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3054     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3055     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3056     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3057     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3058     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3059     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3060     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3061     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3062     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3063
3064
     /* BIT_DEPTH_FUNCS(depth, dct) installs every bit-depth dependent
      * function; "dct" is the suffix (_16 or _32) pasted onto the names of
      * the functions that touch coefficient blocks. avg_h264_qpel is only
      * filled for table indices 0..2. */
3065 #define BIT_DEPTH_FUNCS(depth, dct)\
3066     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
3067     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3068     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3069     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
3070     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
3071     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
3072     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
3073     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3074     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3075 \
3076     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3077     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3078     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3079     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3080     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3081     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3082 \
3083     dspfunc1(put       , 0, 16, depth);\
3084     dspfunc1(put       , 1,  8, depth);\
3085     dspfunc1(put       , 2,  4, depth);\
3086     dspfunc1(put       , 3,  2, depth);\
3087     dspfunc1(put_no_rnd, 0, 16, depth);\
3088     dspfunc1(put_no_rnd, 1,  8, depth);\
3089     dspfunc1(avg       , 0, 16, depth);\
3090     dspfunc1(avg       , 1,  8, depth);\
3091     dspfunc1(avg       , 2,  4, depth);\
3092     dspfunc1(avg       , 3,  2, depth);\
3093     dspfunc1(avg_no_rnd, 0, 16, depth);\
3094     dspfunc1(avg_no_rnd, 1,  8, depth);\
3095 \
3096     dspfunc2(put_h264_qpel, 0, 16, depth);\
3097     dspfunc2(put_h264_qpel, 1,  8, depth);\
3098     dspfunc2(put_h264_qpel, 2,  4, depth);\
3099     dspfunc2(put_h264_qpel, 3,  2, depth);\
3100     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3101     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3102     dspfunc2(avg_h264_qpel, 2,  4, depth);
3103
     /* 9 and 10 bit use the _32 variants when c->dct_bits == 32, otherwise
      * the _16 ones; every other depth gets the 8-bit/_16 set */
3104     switch (avctx->bits_per_raw_sample) {
3105     case 9:
3106         if (c->dct_bits == 32) {
3107             BIT_DEPTH_FUNCS(9, _32);
3108         } else {
3109             BIT_DEPTH_FUNCS(9, _16);
3110         }
3111         break;
3112     case 10:
3113         if (c->dct_bits == 32) {
3114             BIT_DEPTH_FUNCS(10, _32);
3115         } else {
3116             BIT_DEPTH_FUNCS(10, _16);
3117         }
3118         break;
3119     default:
3120         av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
         /* no break: unsupported depths fall through to the 8-bit setup */
3121     case 8:
3122         BIT_DEPTH_FUNCS(8, _16);
3123         break;
3124     }
3125
3126
     /* architecture-specific init, each call guarded by its build-time
      * constant. NOTE(review): presumably these replace some of the C
      * defaults set above with optimized versions -- confirm in the
      * respective files. */
3127     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3128     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3129     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3130     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3131     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3132     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3133     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3134     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3135     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3136
     /* every 2-tap qpel entry that is still NULL gets the h264 qpel function
      * at the same position. The tables are walked as 64 entries through
      * [0][i]; NOTE(review): this relies on them being declared as 64
      * contiguous pointers (e.g. [4][16]) -- confirm in dsputil.h. */
3137     for(i=0; i<64; i++){
3138         if(!c->put_2tap_qpel_pixels_tab[0][i])
3139             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3140         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3141             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3142     }
3143
     /* the mc00 entries of the RV30/RV40 tables reuse the final h264 qpel
      * mc00 functions */
3144     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3145     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3146     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3147     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3148
3149     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3150     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3151     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3152     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3153
     /* build the 64-entry coefficient permutation for the permutation type
      * in effect at this point; an unknown type only logs an error and
      * leaves idct_permutation untouched */
3154     switch(c->idct_permutation_type){
3155     case FF_NO_IDCT_PERM:
3156         for(i=0; i<64; i++)
3157             c->idct_permutation[i]= i;
3158         break;
3159     case FF_LIBMPEG2_IDCT_PERM:
3160         for(i=0; i<64; i++)
3161             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3162         break;
3163     case FF_SIMPLE_IDCT_PERM:
3164         for(i=0; i<64; i++)
3165             c->idct_permutation[i]= simple_mmx_permutation[i];
3166         break;
3167     case FF_TRANSPOSE_IDCT_PERM:
3168         for(i=0; i<64; i++)
3169             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3170         break;
3171     case FF_PARTTRANS_IDCT_PERM:
3172         for(i=0; i<64; i++)
3173             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3174         break;
3175     case FF_SSE2_IDCT_PERM:
3176         for(i=0; i<64; i++)
3177             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3178         break;
3179     default:
3180         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3181     }
3182 }