git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
/* Lookup table accessed in this file as (ff_cropTbl + MAX_NEG_CROP) and
 * indexed with coefficient values / pixel+coefficient sums, which may be
 * negative; hence the MAX_NEG_CROP entries of head-room on each side of the
 * 256 central entries.  Zero-initialised here.
 * NOTE(review): presumably filled in by an init routine outside this chunk. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table accessed in this file as (ff_squareTbl + 256) and indexed with
 * pixel values or pixel differences in [-255, 255].  Zero-initialised here.
 * NOTE(review): presumably holds squares once initialised - confirm. */
uint32_t ff_squareTbl[512] = {0, };
  45
/* Instantiate dsputil_template.c once per supported BIT_DEPTH: 9, 10, then 8. */
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

/* The 8-bit instantiation comes last and BIT_DEPTH is not #undef'd after it,
 * so BIT_DEPTH stays defined as 8 for the remainder of this file. */
#define BIT_DEPTH 8
#include "dsputil_template.c"
  56
  57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  58 #define pb_7f (~0UL/255 * 0x7f)
  59 #define pb_80 (~0UL/255 * 0x80)
  60
/* Zigzag scan order of an 8x8 block: entry k is the raster position
 * (row * 8 + column) of the k-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  71
/* Zigzag scan used with the 2-4-8 IDCT.  NOTE: unlike the specification,
   the two fields are interleaved here.  Each of the 64 entries is a raster
   position (row * 8 + column) inside the 8x8 block. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  84
/* Non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer.
 * 16-byte aligned, 64 entries of uint16_t; no initialiser is given here.
 * NOTE(review): presumably filled at init time outside this chunk. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  87
/* Alternate (horizontal) scan order for an 8x8 block: 64 entries, each a
 * raster position (row * 8 + column) in the range 0..63. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  98
/* Alternate (vertical) scan order for an 8x8 block: 64 entries, each a
 * raster position (row * 8 + column) in the range 0..63. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 109
/* Input permutation for the simple_idct_mmx: 64 entries, each a coefficient
 * index in the range 0x00..0x3F. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 121
/* 8-entry permutation of the indices 0..7.
 * NOTE(review): by its name, the row order expected by the SSE2 IDCT; its
 * use is outside this chunk - confirm. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134 #if ARCH_PPC
 135         st->inverse[j] = i;
 136 #endif
 137     }
 138
 139     end=-1;
 140     for(i=0; i<64; i++){
 141         int j;
 142         j = st->permutated[i];
 143         if(j>end) end=j;
 144         st->raster_end[i]= end;
 145     }
 146 }
 147
 148 static int pix_sum_c(uint8_t * pix, int line_size)
 149 {
 150     int s, i, j;
 151
 152     s = 0;
 153     for (i = 0; i < 16; i++) {
 154         for (j = 0; j < 16; j += 8) {
 155             s += pix[0];
 156             s += pix[1];
 157             s += pix[2];
 158             s += pix[3];
 159             s += pix[4];
 160             s += pix[5];
 161             s += pix[6];
 162             s += pix[7];
 163             pix += 8;
 164         }
 165         pix += line_size - 16;
 166     }
 167     return s;
 168 }
 169
 170 static int pix_norm1_c(uint8_t * pix, int line_size)
 171 {
 172     int s, i, j;
 173     uint32_t *sq = ff_squareTbl + 256;
 174
 175     s = 0;
 176     for (i = 0; i < 16; i++) {
 177         for (j = 0; j < 16; j += 8) {
 178 #if 0
 179             s += sq[pix[0]];
 180             s += sq[pix[1]];
 181             s += sq[pix[2]];
 182             s += sq[pix[3]];
 183             s += sq[pix[4]];
 184             s += sq[pix[5]];
 185             s += sq[pix[6]];
 186             s += sq[pix[7]];
 187 #else
 188 #if HAVE_FAST_64BIT
 189             register uint64_t x=*(uint64_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             s += sq[(x>>32)&0xff];
 195             s += sq[(x>>40)&0xff];
 196             s += sq[(x>>48)&0xff];
 197             s += sq[(x>>56)&0xff];
 198 #else
 199             register uint32_t x=*(uint32_t*)pix;
 200             s += sq[x&0xff];
 201             s += sq[(x>>8)&0xff];
 202             s += sq[(x>>16)&0xff];
 203             s += sq[(x>>24)&0xff];
 204             x=*(uint32_t*)(pix+4);
 205             s += sq[x&0xff];
 206             s += sq[(x>>8)&0xff];
 207             s += sq[(x>>16)&0xff];
 208             s += sq[(x>>24)&0xff];
 209 #endif
 210 #endif
 211             pix += 8;
 212         }
 213         pix += line_size - 16;
 214     }
 215     return s;
 216 }
 217
 218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 219     int i;
 220
 221     for(i=0; i+8<=w; i+=8){
 222         dst[i+0]= av_bswap32(src[i+0]);
 223         dst[i+1]= av_bswap32(src[i+1]);
 224         dst[i+2]= av_bswap32(src[i+2]);
 225         dst[i+3]= av_bswap32(src[i+3]);
 226         dst[i+4]= av_bswap32(src[i+4]);
 227         dst[i+5]= av_bswap32(src[i+5]);
 228         dst[i+6]= av_bswap32(src[i+6]);
 229         dst[i+7]= av_bswap32(src[i+7]);
 230     }
 231     for(;i<w; i++){
 232         dst[i+0]= av_bswap32(src[i+0]);
 233     }
 234 }
 235
 236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 237 {
 238     while (len--)
 239         *dst++ = av_bswap16(*src++);
 240 }
 241
 242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = ff_squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         pix1 += line_size;
 254         pix2 += line_size;
 255     }
 256     return s;
 257 }
 258
 259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 260 {
 261     int s, i;
 262     uint32_t *sq = ff_squareTbl + 256;
 263
 264     s = 0;
 265     for (i = 0; i < h; i++) {
 266         s += sq[pix1[0] - pix2[0]];
 267         s += sq[pix1[1] - pix2[1]];
 268         s += sq[pix1[2] - pix2[2]];
 269         s += sq[pix1[3] - pix2[3]];
 270         s += sq[pix1[4] - pix2[4]];
 271         s += sq[pix1[5] - pix2[5]];
 272         s += sq[pix1[6] - pix2[6]];
 273         s += sq[pix1[7] - pix2[7]];
 274         pix1 += line_size;
 275         pix2 += line_size;
 276     }
 277     return s;
 278 }
 279
 280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 281 {
 282     int s, i;
 283     uint32_t *sq = ff_squareTbl + 256;
 284
 285     s = 0;
 286     for (i = 0; i < h; i++) {
 287         s += sq[pix1[ 0] - pix2[ 0]];
 288         s += sq[pix1[ 1] - pix2[ 1]];
 289         s += sq[pix1[ 2] - pix2[ 2]];
 290         s += sq[pix1[ 3] - pix2[ 3]];
 291         s += sq[pix1[ 4] - pix2[ 4]];
 292         s += sq[pix1[ 5] - pix2[ 5]];
 293         s += sq[pix1[ 6] - pix2[ 6]];
 294         s += sq[pix1[ 7] - pix2[ 7]];
 295         s += sq[pix1[ 8] - pix2[ 8]];
 296         s += sq[pix1[ 9] - pix2[ 9]];
 297         s += sq[pix1[10] - pix2[10]];
 298         s += sq[pix1[11] - pix2[11]];
 299         s += sq[pix1[12] - pix2[12]];
 300         s += sq[pix1[13] - pix2[13]];
 301         s += sq[pix1[14] - pix2[14]];
 302         s += sq[pix1[15] - pix2[15]];
 303
 304         pix1 += line_size;
 305         pix2 += line_size;
 306     }
 307     return s;
 308 }
 309
 310 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 311                           const uint8_t *s2, int stride){
 312     int i;
 313
 314     /* read the pixels */
 315     for(i=0;i<8;i++) {
 316         block[0] = s1[0] - s2[0];
 317         block[1] = s1[1] - s2[1];
 318         block[2] = s1[2] - s2[2];
 319         block[3] = s1[3] - s2[3];
 320         block[4] = s1[4] - s2[4];
 321         block[5] = s1[5] - s2[5];
 322         block[6] = s1[6] - s2[6];
 323         block[7] = s1[7] - s2[7];
 324         s1 += stride;
 325         s2 += stride;
 326         block += 8;
 327     }
 328 }
 329
 330
 331 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 332                              int line_size)
 333 {
 334     int i;
 335     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 336
 337     /* read the pixels */
 338     for(i=0;i<8;i++) {
 339         pixels[0] = cm[block[0]];
 340         pixels[1] = cm[block[1]];
 341         pixels[2] = cm[block[2]];
 342         pixels[3] = cm[block[3]];
 343         pixels[4] = cm[block[4]];
 344         pixels[5] = cm[block[5]];
 345         pixels[6] = cm[block[6]];
 346         pixels[7] = cm[block[7]];
 347
 348         pixels += line_size;
 349         block += 8;
 350     }
 351 }
 352
 353 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 354                                  int line_size)
 355 {
 356     int i;
 357     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 358
 359     /* read the pixels */
 360     for(i=0;i<4;i++) {
 361         pixels[0] = cm[block[0]];
 362         pixels[1] = cm[block[1]];
 363         pixels[2] = cm[block[2]];
 364         pixels[3] = cm[block[3]];
 365
 366         pixels += line_size;
 367         block += 8;
 368     }
 369 }
 370
 371 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 372                                  int line_size)
 373 {
 374     int i;
 375     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 376
 377     /* read the pixels */
 378     for(i=0;i<2;i++) {
 379         pixels[0] = cm[block[0]];
 380         pixels[1] = cm[block[1]];
 381
 382         pixels += line_size;
 383         block += 8;
 384     }
 385 }
 386
 387 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 388                                     uint8_t *restrict pixels,
 389                                     int line_size)
 390 {
 391     int i, j;
 392
 393     for (i = 0; i < 8; i++) {
 394         for (j = 0; j < 8; j++) {
 395             if (*block < -128)
 396                 *pixels = 0;
 397             else if (*block > 127)
 398                 *pixels = 255;
 399             else
 400                 *pixels = (uint8_t)(*block + 128);
 401             block++;
 402             pixels++;
 403         }
 404         pixels += (line_size - 8);
 405     }
 406 }
 407
 408 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 409                              int line_size)
 410 {
 411     int i;
 412     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 413
 414     /* read the pixels */
 415     for(i=0;i<8;i++) {
 416         pixels[0] = cm[pixels[0] + block[0]];
 417         pixels[1] = cm[pixels[1] + block[1]];
 418         pixels[2] = cm[pixels[2] + block[2]];
 419         pixels[3] = cm[pixels[3] + block[3]];
 420         pixels[4] = cm[pixels[4] + block[4]];
 421         pixels[5] = cm[pixels[5] + block[5]];
 422         pixels[6] = cm[pixels[6] + block[6]];
 423         pixels[7] = cm[pixels[7] + block[7]];
 424         pixels += line_size;
 425         block += 8;
 426     }
 427 }
 428
 429 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 430                           int line_size)
 431 {
 432     int i;
 433     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 434
 435     /* read the pixels */
 436     for(i=0;i<4;i++) {
 437         pixels[0] = cm[pixels[0] + block[0]];
 438         pixels[1] = cm[pixels[1] + block[1]];
 439         pixels[2] = cm[pixels[2] + block[2]];
 440         pixels[3] = cm[pixels[3] + block[3]];
 441         pixels += line_size;
 442         block += 8;
 443     }
 444 }
 445
 446 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 447                           int line_size)
 448 {
 449     int i;
 450     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 451
 452     /* read the pixels */
 453     for(i=0;i<2;i++) {
 454         pixels[0] = cm[pixels[0] + block[0]];
 455         pixels[1] = cm[pixels[1] + block[1]];
 456         pixels += line_size;
 457         block += 8;
 458     }
 459 }
 460
 461 static int sum_abs_dctelem_c(DCTELEM *block)
 462 {
 463     int sum=0, i;
 464     for(i=0; i<64; i++)
 465         sum+= FFABS(block[i]);
 466     return sum;
 467 }
 468
 469 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 470 {
 471     int i;
 472
 473     for (i = 0; i < h; i++) {
 474         memset(block, value, 16);
 475         block += line_size;
 476     }
 477 }
 478
 479 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 480 {
 481     int i;
 482
 483     for (i = 0; i < h; i++) {
 484         memset(block, value, 8);
 485         block += line_size;
 486     }
 487 }
 488
 489 #define avg2(a,b) ((a+b+1)>>1)
 490 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 491
 492 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 493 {
 494     const int A=(16-x16)*(16-y16);
 495     const int B=(   x16)*(16-y16);
 496     const int C=(16-x16)*(   y16);
 497     const int D=(   x16)*(   y16);
 498     int i;
 499
 500     for(i=0; i<h; i++)
 501     {
 502         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 503         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 504         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 505         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 506         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 507         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 508         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 509         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 510         dst+= stride;
 511         src+= stride;
 512     }
 513 }
 514
 515 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 516                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 517 {
 518     int y, vx, vy;
 519     const int s= 1<<shift;
 520
 521     width--;
 522     height--;
 523
 524     for(y=0; y<h; y++){
 525         int x;
 526
 527         vx= ox;
 528         vy= oy;
 529         for(x=0; x<8; x++){ //XXX FIXME optimize
 530             int src_x, src_y, frac_x, frac_y, index;
 531
 532             src_x= vx>>16;
 533             src_y= vy>>16;
 534             frac_x= src_x&(s-1);
 535             frac_y= src_y&(s-1);
 536             src_x>>=shift;
 537             src_y>>=shift;
 538
 539             if((unsigned)src_x < width){
 540                 if((unsigned)src_y < height){
 541                     index= src_x + src_y*stride;
 542                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 543                                            + src[index       +1]*   frac_x )*(s-frac_y)
 544                                         + (  src[index+stride  ]*(s-frac_x)
 545                                            + src[index+stride+1]*   frac_x )*   frac_y
 546                                         + r)>>(shift*2);
 547                 }else{
 548                     index= src_x + av_clip(src_y, 0, height)*stride;
 549                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 550                                           + src[index       +1]*   frac_x )*s
 551                                         + r)>>(shift*2);
 552                 }
 553             }else{
 554                 if((unsigned)src_y < height){
 555                     index= av_clip(src_x, 0, width) + src_y*stride;
 556                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 557                                            + src[index+stride  ]*   frac_y )*s
 558                                         + r)>>(shift*2);
 559                 }else{
 560                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 561                     dst[y*stride + x]=    src[index         ];
 562                 }
 563             }
 564
 565             vx+= dxx;
 566             vy+= dyx;
 567         }
 568         ox += dxy;
 569         oy += dyy;
 570     }
 571 }
 572
 573 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 574     switch(width){
 575     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 576     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 577     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 578     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 579     }
 580 }
 581
 582 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 583     int i,j;
 584     for (i=0; i < height; i++) {
 585       for (j=0; j < width; j++) {
 586         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 587       }
 588       src += stride;
 589       dst += stride;
 590     }
 591 }
 592
 593 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 594     int i,j;
 595     for (i=0; i < height; i++) {
 596       for (j=0; j < width; j++) {
 597         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 598       }
 599       src += stride;
 600       dst += stride;
 601     }
 602 }
 603
 604 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 605     int i,j;
 606     for (i=0; i < height; i++) {
 607       for (j=0; j < width; j++) {
 608         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 609       }
 610       src += stride;
 611       dst += stride;
 612     }
 613 }
 614
 615 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 616     int i,j;
 617     for (i=0; i < height; i++) {
 618       for (j=0; j < width; j++) {
 619         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 620       }
 621       src += stride;
 622       dst += stride;
 623     }
 624 }
 625
 626 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 627     int i,j;
 628     for (i=0; i < height; i++) {
 629       for (j=0; j < width; j++) {
 630         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 631       }
 632       src += stride;
 633       dst += stride;
 634     }
 635 }
 636
 637 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 638     int i,j;
 639     for (i=0; i < height; i++) {
 640       for (j=0; j < width; j++) {
 641         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 642       }
 643       src += stride;
 644       dst += stride;
 645     }
 646 }
 647
 648 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 649     int i,j;
 650     for (i=0; i < height; i++) {
 651       for (j=0; j < width; j++) {
 652         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 653       }
 654       src += stride;
 655       dst += stride;
 656     }
 657 }
 658
 659 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 660     int i,j;
 661     for (i=0; i < height; i++) {
 662       for (j=0; j < width; j++) {
 663         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 664       }
 665       src += stride;
 666       dst += stride;
 667     }
 668 }
 669
 670 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 671     switch(width){
 672     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 673     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 674     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 675     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 676     }
 677 }
 678
 679 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 680     int i,j;
 681     for (i=0; i < height; i++) {
 682       for (j=0; j < width; j++) {
 683         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 684       }
 685       src += stride;
 686       dst += stride;
 687     }
 688 }
 689
 690 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 691     int i,j;
 692     for (i=0; i < height; i++) {
 693       for (j=0; j < width; j++) {
 694         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 695       }
 696       src += stride;
 697       dst += stride;
 698     }
 699 }
 700
 701 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 702     int i,j;
 703     for (i=0; i < height; i++) {
 704       for (j=0; j < width; j++) {
 705         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 706       }
 707       src += stride;
 708       dst += stride;
 709     }
 710 }
 711
 712 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 713     int i,j;
 714     for (i=0; i < height; i++) {
 715       for (j=0; j < width; j++) {
 716         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 717       }
 718       src += stride;
 719       dst += stride;
 720     }
 721 }
 722
 723 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 724     int i,j;
 725     for (i=0; i < height; i++) {
 726       for (j=0; j < width; j++) {
 727         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 728       }
 729       src += stride;
 730       dst += stride;
 731     }
 732 }
 733
 734 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 735     int i,j;
 736     for (i=0; i < height; i++) {
 737       for (j=0; j < width; j++) {
 738         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 739       }
 740       src += stride;
 741       dst += stride;
 742     }
 743 }
 744
 745 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 746     int i,j;
 747     for (i=0; i < height; i++) {
 748       for (j=0; j < width; j++) {
 749         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 750       }
 751       src += stride;
 752       dst += stride;
 753     }
 754 }
 755
 756 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 757     int i,j;
 758     for (i=0; i < height; i++) {
 759       for (j=0; j < width; j++) {
 760         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 761       }
 762       src += stride;
 763       dst += stride;
 764     }
 765 }
 766
 767 #define QPEL_MC(r, OPNAME, RND, OP) \
 768 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 769     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 770     int i;\
 771     for(i=0; i<h; i++)\
 772     {\
 773         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 774         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 775         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 776         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 777         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 778         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 779         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 780         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 781         dst+=dstStride;\
 782         src+=srcStride;\
 783     }\
 784 }\
 785 \
 786 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 787     const int w=8;\
 788     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 789     int i;\
 790     for(i=0; i<w; i++)\
 791     {\
 792         const int src0= src[0*srcStride];\
 793         const int src1= src[1*srcStride];\
 794         const int src2= src[2*srcStride];\
 795         const int src3= src[3*srcStride];\
 796         const int src4= src[4*srcStride];\
 797         const int src5= src[5*srcStride];\
 798         const int src6= src[6*srcStride];\
 799         const int src7= src[7*srcStride];\
 800         const int src8= src[8*srcStride];\
 801         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 802         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 803         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 804         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 805         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 806         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 807         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 808         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 809         dst++;\
 810         src++;\
 811     }\
 812 }\
 813 \
 814 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 815     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 816     int i;\
 817     \
 818     for(i=0; i<h; i++)\
 819     {\
 820         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 821         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 822         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 823         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 824         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 825         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 826         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 827         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 828         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 829         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 830         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 831         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 832         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 833         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 834         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 835         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 836         dst+=dstStride;\
 837         src+=srcStride;\
 838     }\
 839 }\
 840 \
 841 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 842     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 843     int i;\
 844     const int w=16;\
 845     for(i=0; i<w; i++)\
 846     {\
 847         const int src0= src[0*srcStride];\
 848         const int src1= src[1*srcStride];\
 849         const int src2= src[2*srcStride];\
 850         const int src3= src[3*srcStride];\
 851         const int src4= src[4*srcStride];\
 852         const int src5= src[5*srcStride];\
 853         const int src6= src[6*srcStride];\
 854         const int src7= src[7*srcStride];\
 855         const int src8= src[8*srcStride];\
 856         const int src9= src[9*srcStride];\
 857         const int src10= src[10*srcStride];\
 858         const int src11= src[11*srcStride];\
 859         const int src12= src[12*srcStride];\
 860         const int src13= src[13*srcStride];\
 861         const int src14= src[14*srcStride];\
 862         const int src15= src[15*srcStride];\
 863         const int src16= src[16*srcStride];\
 864         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 865         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 866         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 867         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 868         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 869         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 870         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 871         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 872         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 873         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 874         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 875         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 876         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 877         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 878         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 879         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 880         dst++;\
 881         src++;\
 882     }\
 883 }\
 884 \
 885 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 886     uint8_t half[64];\
 887     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 888     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 889 }\
 890 \
 891 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 892     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 893 }\
 894 \
 895 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 896     uint8_t half[64];\
 897     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 898     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 899 }\
 900 \
 901 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 902     uint8_t full[16*9];\
 903     uint8_t half[64];\
 904     copy_block9(full, src, 16, stride, 9);\
 905     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 906     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 907 }\
 908 \
 909 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 910     uint8_t full[16*9];\
 911     copy_block9(full, src, 16, stride, 9);\
 912     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 913 }\
 914 \
 915 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 916     uint8_t full[16*9];\
 917     uint8_t half[64];\
 918     copy_block9(full, src, 16, stride, 9);\
 919     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 920     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 921 }\
 922 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 923     uint8_t full[16*9];\
 924     uint8_t halfH[72];\
 925     uint8_t halfV[64];\
 926     uint8_t halfHV[64];\
 927     copy_block9(full, src, 16, stride, 9);\
 928     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 929     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 930     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 931     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 932 }\
 933 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 934     uint8_t full[16*9];\
 935     uint8_t halfH[72];\
 936     uint8_t halfHV[64];\
 937     copy_block9(full, src, 16, stride, 9);\
 938     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 939     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 940     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 941     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 942 }\
 943 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 944     uint8_t full[16*9];\
 945     uint8_t halfH[72];\
 946     uint8_t halfV[64];\
 947     uint8_t halfHV[64];\
 948     copy_block9(full, src, 16, stride, 9);\
 949     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 950     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 951     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 952     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 953 }\
 954 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 955     uint8_t full[16*9];\
 956     uint8_t halfH[72];\
 957     uint8_t halfHV[64];\
 958     copy_block9(full, src, 16, stride, 9);\
 959     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 960     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 961     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 962     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 963 }\
 964 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
 965     uint8_t full[16*9];\
 966     uint8_t halfH[72];\
 967     uint8_t halfV[64];\
 968     uint8_t halfHV[64];\
 969     copy_block9(full, src, 16, stride, 9);\
 970     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 971     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 972     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 973     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 974 }\
 975 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
 976     uint8_t full[16*9];\
 977     uint8_t halfH[72];\
 978     uint8_t halfHV[64];\
 979     copy_block9(full, src, 16, stride, 9);\
 980     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 981     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 982     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 983     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 984 }\
 985 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
 986     uint8_t full[16*9];\
 987     uint8_t halfH[72];\
 988     uint8_t halfV[64];\
 989     uint8_t halfHV[64];\
 990     copy_block9(full, src, 16, stride, 9);\
 991     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 992     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 993     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 994     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 995 }\
 996 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
 997     uint8_t full[16*9];\
 998     uint8_t halfH[72];\
 999     uint8_t halfHV[64];\
1000     copy_block9(full, src, 16, stride, 9);\
1001     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1002     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1003     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1004     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1005 }\
1006 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1007     uint8_t halfH[72];\
1008     uint8_t halfHV[64];\
1009     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1010     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1011     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1012 }\
1013 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1014     uint8_t halfH[72];\
1015     uint8_t halfHV[64];\
1016     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1019 }\
1020 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021     uint8_t full[16*9];\
1022     uint8_t halfH[72];\
1023     uint8_t halfV[64];\
1024     uint8_t halfHV[64];\
1025     copy_block9(full, src, 16, stride, 9);\
1026     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1030 }\
1031 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1032     uint8_t full[16*9];\
1033     uint8_t halfH[72];\
1034     copy_block9(full, src, 16, stride, 9);\
1035     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1037     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1038 }\
1039 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1040     uint8_t full[16*9];\
1041     uint8_t halfH[72];\
1042     uint8_t halfV[64];\
1043     uint8_t halfHV[64];\
1044     copy_block9(full, src, 16, stride, 9);\
1045     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1046     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1047     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1048     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1049 }\
1050 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1051     uint8_t full[16*9];\
1052     uint8_t halfH[72];\
1053     copy_block9(full, src, 16, stride, 9);\
1054     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1055     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1056     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1057 }\
1058 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1059     uint8_t halfH[72];\
1060     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1061     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1062 }\
1063 \
1064 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1065     uint8_t half[256];\
1066     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1067     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1068 }\
1069 \
1070 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1071     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1072 }\
1073 \
1074 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1075     uint8_t half[256];\
1076     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1077     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1078 }\
1079 \
1080 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1081     uint8_t full[24*17];\
1082     uint8_t half[256];\
1083     copy_block17(full, src, 24, stride, 17);\
1084     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1085     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1086 }\
1087 \
1088 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1089     uint8_t full[24*17];\
1090     copy_block17(full, src, 24, stride, 17);\
1091     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1092 }\
1093 \
1094 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1095     uint8_t full[24*17];\
1096     uint8_t half[256];\
1097     copy_block17(full, src, 24, stride, 17);\
1098     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1099     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1100 }\
1101 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1102     uint8_t full[24*17];\
1103     uint8_t halfH[272];\
1104     uint8_t halfV[256];\
1105     uint8_t halfHV[256];\
1106     copy_block17(full, src, 24, stride, 17);\
1107     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1111 }\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1113     uint8_t full[24*17];\
1114     uint8_t halfH[272];\
1115     uint8_t halfHV[256];\
1116     copy_block17(full, src, 24, stride, 17);\
1117     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1118     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1119     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1120     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1121 }\
1122 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1123     uint8_t full[24*17];\
1124     uint8_t halfH[272];\
1125     uint8_t halfV[256];\
1126     uint8_t halfHV[256];\
1127     copy_block17(full, src, 24, stride, 17);\
1128     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1129     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1130     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1131     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1132 }\
1133 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1134     uint8_t full[24*17];\
1135     uint8_t halfH[272];\
1136     uint8_t halfHV[256];\
1137     copy_block17(full, src, 24, stride, 17);\
1138     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1139     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1140     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1141     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1142 }\
1143 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1144     uint8_t full[24*17];\
1145     uint8_t halfH[272];\
1146     uint8_t halfV[256];\
1147     uint8_t halfHV[256];\
1148     copy_block17(full, src, 24, stride, 17);\
1149     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1150     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1152     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1153 }\
1154 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1155     uint8_t full[24*17];\
1156     uint8_t halfH[272];\
1157     uint8_t halfHV[256];\
1158     copy_block17(full, src, 24, stride, 17);\
1159     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1160     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1161     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1162     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1163 }\
1164 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1165     uint8_t full[24*17];\
1166     uint8_t halfH[272];\
1167     uint8_t halfV[256];\
1168     uint8_t halfHV[256];\
1169     copy_block17(full, src, 24, stride, 17);\
1170     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1171     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1172     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1173     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1174 }\
1175 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1176     uint8_t full[24*17];\
1177     uint8_t halfH[272];\
1178     uint8_t halfHV[256];\
1179     copy_block17(full, src, 24, stride, 17);\
1180     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1181     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1182     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1183     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1184 }\
1185 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1186     uint8_t halfH[272];\
1187     uint8_t halfHV[256];\
1188     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1189     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1191 }\
1192 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1193     uint8_t halfH[272];\
1194     uint8_t halfHV[256];\
1195     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1198 }\
1199 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200     uint8_t full[24*17];\
1201     uint8_t halfH[272];\
1202     uint8_t halfV[256];\
1203     uint8_t halfHV[256];\
1204     copy_block17(full, src, 24, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1209 }\
1210 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1211     uint8_t full[24*17];\
1212     uint8_t halfH[272];\
1213     copy_block17(full, src, 24, stride, 17);\
1214     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1216     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1217 }\
1218 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1219     uint8_t full[24*17];\
1220     uint8_t halfH[272];\
1221     uint8_t halfV[256];\
1222     uint8_t halfHV[256];\
1223     copy_block17(full, src, 24, stride, 17);\
1224     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1225     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1226     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1227     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1228 }\
1229 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1230     uint8_t full[24*17];\
1231     uint8_t halfH[272];\
1232     copy_block17(full, src, 24, stride, 17);\
1233     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1234     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1235     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1236 }\
1237 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1238     uint8_t halfH[272];\
1239     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1240     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1241 }
1242
/*
 * Store operations handed to QPEL_MC as its last argument.
 *
 * 'b' is the un-normalised filter sum produced by the lowpass functions; the
 * weights used there (20, -6, 3, -1, each applied to a pair of samples) add
 * up to 32, which is why every variant shifts right by 5.  Adding 16 before
 * the shift rounds to nearest, adding 15 is the "no_rnd" flavour.  The result
 * is looked up through 'cm', which must be in scope where the macro expands
 * (the lowpass functions declare it as ff_cropTbl + MAX_NEG_CROP).
 *
 * The op_avg variants additionally average the new value with what is already
 * in 'a'; op_avg adds 1 before halving, op_avg_no_rnd does not.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/*
 * Instantiate the qpel function family three times.  Judging from how the
 * macro body uses its parameters, the second argument is the function-name
 * prefix, the third is pasted into the name of the "put" helper used for
 * intermediate buffers, and the fourth is the store operation.
 * NOTE(review): the macro header is outside this excerpt -- confirm the
 * meaning of the first (0/1) argument there.
 * The avg_no_rnd instantiation below is disabled, so op_avg_no_rnd is not
 * referenced by any of the active instantiations here.
 */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The store operations are only meant for the instantiations above. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1256
/*
 * QPEL_MC does not generate an mc00 variant in the part of the macro visible
 * here; the mc00 names are aliased to existing whole-block pixel functions
 * instead.  Both put_no_rnd aliases map to put functions.
 * NOTE(review): the last alias targets ff_put_pixels16x16_8_c while the
 * others use names without the "_8" infix -- presumably both symbols exist
 * and are equivalent; confirm against their definitions.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1263
1264 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1265     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1266     int i;
1267
1268     for(i=0; i<h; i++){
1269         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1270         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1271         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1272         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1273         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1274         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1275         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1276         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1277         dst+=dstStride;
1278         src+=srcStride;
1279     }
1280 }
1281
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points.  Each one simply forwards to the 8-bit "xy2" pixel
 * function of the matching block size (16 or 8) and operation (put or avg),
 * passing the block size as the row count.
 */

/** 16x16 put: forwards to put_pixels16_xy2_8_c() with 16 rows. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/** 16x16 avg: forwards to avg_pixels16_xy2_8_c() with 16 rows. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/** 8x8 put: forwards to put_pixels8_xy2_8_c() with 8 rows. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/** 8x8 avg: forwards to avg_pixels8_xy2_8_c() with 8 rows. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1296
1297 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1298     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1299     int i;
1300
1301     for(i=0; i<w; i++){
1302         const int src_1= src[ -srcStride];
1303         const int src0 = src[0          ];
1304         const int src1 = src[  srcStride];
1305         const int src2 = src[2*srcStride];
1306         const int src3 = src[3*srcStride];
1307         const int src4 = src[4*srcStride];
1308         const int src5 = src[5*srcStride];
1309         const int src6 = src[6*srcStride];
1310         const int src7 = src[7*srcStride];
1311         const int src8 = src[8*srcStride];
1312         const int src9 = src[9*srcStride];
1313         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1314         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1315         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1316         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1317         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1318         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1319         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1320         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1321         src++;
1322         dst++;
1323     }
1324 }
1325
/**
 * 8x8 mspel mc10: horizontally filter src into a temporary 8x8 block, then
 * combine src and that block into dst with put_pixels8_l2_8() (presumably a
 * per-pixel average of its two inputs -- defined outside this excerpt).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1331
/**
 * 8x8 mspel mc20: the horizontally filtered block is written straight to
 * dst (8 rows, same stride for source and destination).
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1335
/**
 * 8x8 mspel mc30: same as mc10 except that the unfiltered input handed to
 * put_pixels8_l2_8() starts one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1341
/**
 * 8x8 mspel mc02: the vertically filtered block is written straight to dst
 * (8 columns, same stride for source and destination).
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1345
/**
 * 8x8 mspel mc12.
 *
 * Three intermediates are built and the last two are combined into dst:
 *  - 11 horizontally filtered rows starting one row above src (the vertical
 *    filter needs one row above and two below the 8 output rows),
 *  - the vertically filtered source,
 *  - the vertical filter applied to the horizontally filtered rows, starting
 *    at their second row.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc32: identical to mc12 except that the purely vertical
 * intermediate is computed from src + 1 (one pixel to the right).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc22: horizontal filter into an 11-row temporary (starting one
 * row above src), then vertical filter of that temporary, from its second
 * row on, directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8 * 11];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hfilt + 8, stride, 8, 8);
}
1369
1370 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1371     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1372     int x;
1373     const int strength= ff_h263_loop_filter_strength[qscale];
1374
1375     for(x=0; x<8; x++){
1376         int d1, d2, ad1;
1377         int p0= src[x-2*stride];
1378         int p1= src[x-1*stride];
1379         int p2= src[x+0*stride];
1380         int p3= src[x+1*stride];
1381         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1382
1383         if     (d<-2*strength) d1= 0;
1384         else if(d<-  strength) d1=-2*strength - d;
1385         else if(d<   strength) d1= d;
1386         else if(d< 2*strength) d1= 2*strength - d;
1387         else                   d1= 0;
1388
1389         p1 += d1;
1390         p2 -= d1;
1391         if(p1&256) p1= ~(p1>>31);
1392         if(p2&256) p2= ~(p2>>31);
1393
1394         src[x-1*stride] = p1;
1395         src[x+0*stride] = p2;
1396
1397         ad1= FFABS(d1)>>1;
1398
1399         d2= av_clip((p0-p3)/4, -ad1, ad1);
1400
1401         src[x-2*stride] = p0 - d2;
1402         src[x+  stride] = p3 + d2;
1403     }
1404     }
1405 }
1406
1407 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1408     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1409     int y;
1410     const int strength= ff_h263_loop_filter_strength[qscale];
1411
1412     for(y=0; y<8; y++){
1413         int d1, d2, ad1;
1414         int p0= src[y*stride-2];
1415         int p1= src[y*stride-1];
1416         int p2= src[y*stride+0];
1417         int p3= src[y*stride+1];
1418         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1419
1420         if     (d<-2*strength) d1= 0;
1421         else if(d<-  strength) d1=-2*strength - d;
1422         else if(d<   strength) d1= d;
1423         else if(d< 2*strength) d1= 2*strength - d;
1424         else                   d1= 0;
1425
1426         p1 += d1;
1427         p2 -= d1;
1428         if(p1&256) p1= ~(p1>>31);
1429         if(p2&256) p2= ~(p2>>31);
1430
1431         src[y*stride-1] = p1;
1432         src[y*stride+0] = p2;
1433
1434         ad1= FFABS(d1)>>1;
1435
1436         d2= av_clip((p0-p3)/4, -ad1, ad1);
1437
1438         src[y*stride-2] = p0 - d2;
1439         src[y*stride+1] = p3 + d2;
1440     }
1441     }
1442 }
1443
/**
 * H.261 in-loop filter: separable (1, 2, 1) smoothing of an 8x8 block,
 * performed in place.
 *
 * The vertical pass writes into a temporary; the first and last rows are not
 * filtered vertically and are scaled by 4 so that all rows share one scale.
 * The horizontal pass then leaves the first and last columns unfiltered
 * (only the vertical scale of 4 is removed, with rounding) and applies the
 * (1, 2, 1) kernel to the six inner columns, removing the combined scale of
 * 16 with rounding.
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    /* vertical pass: the whole block is read before anything is written */
    for (row = 0; row < 8; row++) {
        const uint8_t *cur = src + row * stride;
        const int border   = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (border)
                vert[row][col] = 4 * cur[col];
            else
                vert[row][col] = cur[col - stride] + 2 * cur[col] + cur[col + stride];
        }
    }

    /* horizontal pass, writing the result back */
    for (row = 0; row < 8; row++) {
        uint8_t   *out = src + row * stride;
        const int *t   = vert[row];

        out[0] = (t[0] + 2) >> 2;
        out[7] = (t[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (t[col - 1] + 2 * t[col] + t[col + 1] + 8) >> 4;
    }
}
1470
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated |pix1 - pix2| over 16 columns and h rows
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1498
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * in which every pixel is combined with its right-hand neighbour via avg2().
 * Each row therefore reads 17 reference pixels.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute differences
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1526
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * in which every pixel is combined with the pixel directly below it via
 * avg2().  h + 1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute differences
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1556
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * in which every pixel is combined with its right, lower and lower-right
 * neighbours via avg4().  17 pixels from each of h + 1 reference rows are
 * read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute differences
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1586
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte offset between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *lhs = pix1;
    const uint8_t *rhs = pix2;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(lhs[col] - rhs[col]);
        lhs += line_size;
        rhs += line_size;
    }
    return total;
}
1606
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against avg2() of each pix2 pixel and its right-hand neighbour.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; column 8 of each row is read as well
 * @param line_size byte offset between consecutive rows
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(ref[col], ref[col + 1]));
        cur += line_size;
        ref += line_size;
    }
    return total;
}
1626
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against avg2() of each pix2 pixel and the pixel line_size bytes after it.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; one extra row past h is read
 * @param line_size byte offset between consecutive rows
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
1648
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against avg4() of each 2x2 neighbourhood of pix2 (the pixel, its right
 * neighbour, and the same two positions line_size bytes further on).
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; column 8 and one extra row are read
 * @param line_size byte offset between consecutive rows
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg4(upper[col], upper[col + 1],
                                         lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
1670
1671 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1672     MpegEncContext *c = v;
1673     int score1=0;
1674     int score2=0;
1675     int x,y;
1676
1677     for(y=0; y<h; y++){
1678         for(x=0; x<16; x++){
1679             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1680         }
1681         if(y+1<h){
1682             for(x=0; x<15; x++){
1683                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1684                              - s1[x+1] + s1[x+1+stride])
1685                         -FFABS(  s2[x  ] - s2[x  +stride]
1686                              - s2[x+1] + s2[x+1+stride]);
1687             }
1688         }
1689         s1+= stride;
1690         s2+= stride;
1691     }
1692
1693     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1694     else  return score1 + FFABS(score2)*8;
1695 }
1696
1697 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1698     MpegEncContext *c = v;
1699     int score1=0;
1700     int score2=0;
1701     int x,y;
1702
1703     for(y=0; y<h; y++){
1704         for(x=0; x<8; x++){
1705             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1706         }
1707         if(y+1<h){
1708             for(x=0; x<7; x++){
1709                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1710                              - s1[x+1] + s1[x+1+stride])
1711                         -FFABS(  s2[x  ] - s2[x  +stride]
1712                              - s2[x+1] + s2[x+1+stride]);
1713             }
1714         }
1715         s1+= stride;
1716         s2+= stride;
1717     }
1718
1719     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1720     else  return score1 + FFABS(score2)*8;
1721 }
1722
1723 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1724     int i;
1725     unsigned int sum=0;
1726
1727     for(i=0; i<8*8; i++){
1728         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1729         int w= weight[i];
1730         b>>= RECON_SHIFT;
1731         assert(-512<b && b<512);
1732
1733         sum += (w*b)*(w*b)>>4;
1734     }
1735     return sum>>2;
1736 }
1737
1738 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1739     int i;
1740
1741     for(i=0; i<8*8; i++){
1742         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1743     }
1744 }
1745
1746 /**
1747  * permutes an 8x8 block.
1748  * @param block the block which will be permuted according to the given permutation vector
1749  * @param permutation the permutation vector
1750  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1751  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1752  *                  (inverse) permutated to scantable order!
1753  */
1754 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1755 {
1756     int i;
1757     DCTELEM temp[64];
1758
1759     if(last<=0) return;
1760     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1761
1762     for(i=0; i<=last; i++){
1763         const int j= scantable[i];
1764         temp[j]= block[j];
1765         block[j]=0;
1766     }
1767
1768     for(i=0; i<=last; i++){
1769         const int j= scantable[i];
1770         const int perm_j= permutation[j];
1771         block[perm_j]= temp[j];
1772     }
1773 }
1774
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1778
1779 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1780     int i;
1781
1782     memset(cmp, 0, sizeof(void*)*6);
1783
1784     for(i=0; i<6; i++){
1785         switch(type&0xFF){
1786         case FF_CMP_SAD:
1787             cmp[i]= c->sad[i];
1788             break;
1789         case FF_CMP_SATD:
1790             cmp[i]= c->hadamard8_diff[i];
1791             break;
1792         case FF_CMP_SSE:
1793             cmp[i]= c->sse[i];
1794             break;
1795         case FF_CMP_DCT:
1796             cmp[i]= c->dct_sad[i];
1797             break;
1798         case FF_CMP_DCT264:
1799             cmp[i]= c->dct264_sad[i];
1800             break;
1801         case FF_CMP_DCTMAX:
1802             cmp[i]= c->dct_max[i];
1803             break;
1804         case FF_CMP_PSNR:
1805             cmp[i]= c->quant_psnr[i];
1806             break;
1807         case FF_CMP_BIT:
1808             cmp[i]= c->bit[i];
1809             break;
1810         case FF_CMP_RD:
1811             cmp[i]= c->rd[i];
1812             break;
1813         case FF_CMP_VSAD:
1814             cmp[i]= c->vsad[i];
1815             break;
1816         case FF_CMP_VSSE:
1817             cmp[i]= c->vsse[i];
1818             break;
1819         case FF_CMP_ZERO:
1820             cmp[i]= zero_cmp;
1821             break;
1822         case FF_CMP_NSSE:
1823             cmp[i]= c->nsse[i];
1824             break;
1825 #if CONFIG_DWT
1826         case FF_CMP_W53:
1827             cmp[i]= c->w53[i];
1828             break;
1829         case FF_CMP_W97:
1830             cmp[i]= c->w97[i];
1831             break;
1832 #endif
1833         default:
1834             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1835         }
1836     }
1837 }
1838
1839 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1840     long i;
1841     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1842         long a = *(long*)(src+i);
1843         long b = *(long*)(dst+i);
1844         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1845     }
1846     for(; i<w; i++)
1847         dst[i+0] += src[i+0];
1848 }
1849
1850 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1851     long i;
1852     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1853         long a = *(long*)(src1+i);
1854         long b = *(long*)(src2+i);
1855         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1856     }
1857     for(; i<w; i++)
1858         dst[i] = src1[i]+src2[i];
1859 }
1860
1861 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1862     long i;
1863 #if !HAVE_FAST_UNALIGNED
1864     if((long)src2 & (sizeof(long)-1)){
1865         for(i=0; i+7<w; i+=8){
1866             dst[i+0] = src1[i+0]-src2[i+0];
1867             dst[i+1] = src1[i+1]-src2[i+1];
1868             dst[i+2] = src1[i+2]-src2[i+2];
1869             dst[i+3] = src1[i+3]-src2[i+3];
1870             dst[i+4] = src1[i+4]-src2[i+4];
1871             dst[i+5] = src1[i+5]-src2[i+5];
1872             dst[i+6] = src1[i+6]-src2[i+6];
1873             dst[i+7] = src1[i+7]-src2[i+7];
1874         }
1875     }else
1876 #endif
1877     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1878         long a = *(long*)(src1+i);
1879         long b = *(long*)(src2+i);
1880         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1881     }
1882     for(; i<w; i++)
1883         dst[i+0] = src1[i+0]-src2[i+0];
1884 }
1885
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous output
 * byte, src1[i], and (previous + src1[i] - previous src1 value) & 0xFF; the
 * residual diff[i] is added and the result is stored truncated to 8 bits.
 *
 * @param dst      output row
 * @param src1     reference row
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: initial previous output value; out: last output byte
 * @param left_top in: initial previous src1 value; out: last src1 byte read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t prev       = *left;
    uint8_t prev_above = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int above    = src1[pos];
        const int gradient = (prev + above - prev_above) & 0xFF;

        prev       = mid_pred(prev, above, gradient) + diff[pos];
        prev_above = above;
        dst[pos]   = prev;
    }

    *left     = prev;
    *left_top = prev_above;
}
1902
/**
 * Compute residuals of a row against a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous src2 byte,
 * src1[i], and (previous + src1[i] - previous src1 value) & 0xFF; the value
 * stored in dst is src2[i] minus that predictor, truncated to 8 bits.
 *
 * @param dst      output residuals
 * @param src1     reference row
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: initial previous src2 value; out: last src2 byte read
 * @param left_top in: initial previous src1 value; out: last src1 byte read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev       = *left;
    uint8_t prev_above = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int above    = src1[pos];
        const int gradient = (prev + above - prev_above) & 0xFF;
        const int pred     = mid_pred(prev, above, gradient);

        prev_above = above;
        prev       = src2[pos];
        dst[pos]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_above;
}
1920
/**
 * Running sum of src: acc is increased by each source byte in turn and its
 * low 8 bits are stored in dst.
 *
 * @param dst output row
 * @param src input row
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the final accumulator value (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1939
/* Byte offsets of the four channels inside one 4-byte pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over 4-byte pixels: every channel keeps its own
 * accumulator, which is increased by the source byte and whose low 8 bits
 * are stored in dst.
 *
 * @param dst   output pixels, 4 bytes each
 * @param src   input pixels, 4 bytes each
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: final accumulator
 * @param green in: initial accumulator; out: final accumulator
 * @param blue  in: initial accumulator; out: final accumulator
 * @param alpha in: initial accumulator; out: final accumulator
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    const uint8_t *in  = src;
    uint8_t       *out = dst;
    int sum_r = *red;
    int sum_g = *green;
    int sum_b = *blue;
    int sum_a = *alpha;
    int left;

    for (left = w; left > 0; left--) {
        sum_b += in[B];
        sum_g += in[G];
        sum_r += in[R];
        sum_a += in[A];

        out[B] = sum_b;
        out[G] = sum_g;
        out[R] = sum_r;
        out[A] = sum_a;

        in  += 4;
        out += 4;
    }

    *red   = sum_r;
    *green = sum_g;
    *blue  = sum_b;
    *alpha = sum_a;
}
#undef B
#undef G
#undef R
#undef A
1980
/* o1 = i1 + i2 and o2 = i1 - i2.
 * Expands to two statements, so it must not be used as the unbraced body of
 * an if/else/loop; i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In place: x becomes x + y and y becomes x - y (using the old values).
 * The temporaries carry long names so that arguments called 'a' or 'b' are
 * not captured by the macro's own locals. */
#define BUTTERFLY1(x,y) \
{\
    const int butterfly_sum_in_= (x);\
    const int butterfly_dif_in_= (y);\
    (x)= butterfly_sum_in_+butterfly_dif_in_;\
    (y)= butterfly_sum_in_-butterfly_dif_in_;\
}

/* |x + y| + |x - y|; both arguments are evaluated several times. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1995
/**
 * Apply three butterfly stages along each row and then along each column of
 * the 8x8 difference src - dst, and return the sum of the absolute values
 * of the results.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride byte offset between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int total = 0;
    int n;

    assert(h==8);

    /* first pass: three butterfly stages along each row of the difference */
    for (n = 0; n < 8; n++) {
        int *t = temp + 8*n;
        const uint8_t *p = src + stride*n;
        const uint8_t *q = dst + stride*n;

        BUTTERFLY2(t[0], t[1], p[0]-q[0], p[1]-q[1]);
        BUTTERFLY2(t[2], t[3], p[2]-q[2], p[3]-q[3]);
        BUTTERFLY2(t[4], t[5], p[4]-q[4], p[5]-q[5]);
        BUTTERFLY2(t[6], t[7], p[6]-q[6], p[7]-q[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* second pass: same stages down each column; the last stage is folded
     * into the absolute-value accumulation */
    for (n = 0; n < 8; n++) {
        int *t = temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        total += BUTTERFLYA(t[8*0], t[8*4])
               + BUTTERFLYA(t[8*1], t[8*5])
               + BUTTERFLYA(t[8*2], t[8*6])
               + BUTTERFLYA(t[8*3], t[8*7]);
    }
    return total;
}
2040
/**
 * Apply three butterfly stages along each row and then along each column of
 * an 8x8 block of src, and return the sum of the absolute values of the
 * results with |temp[0] + temp[32]| subtracted at the end (marked "-mean"
 * in the original code).
 *
 * @param s      unused
 * @param src    source block
 * @param dummy  unused
 * @param stride byte offset between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed values minus the term above
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int n;

    assert(h==8);

    /* first pass: three butterfly stages along each row */
    for (n = 0; n < 8; n++) {
        int *t = temp + 8*n;
        const uint8_t *p = src + stride*n;

        BUTTERFLY2(t[0], t[1], p[0], p[1]);
        BUTTERFLY2(t[2], t[3], p[2], p[3]);
        BUTTERFLY2(t[4], t[5], p[4], p[5]);
        BUTTERFLY2(t[6], t[7], p[6], p[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* second pass: same stages down each column; the last stage is folded
     * into the absolute-value accumulation */
    for (n = 0; n < 8; n++) {
        int *t = temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        total += BUTTERFLYA(t[8*0], t[8*4])
               + BUTTERFLYA(t[8*1], t[8*5])
               + BUTTERFLYA(t[8*2], t[8*6])
               + BUTTERFLYA(t[8*3], t[8*7]);
    }

    total -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return total;
}
2088
2089 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2090     MpegEncContext * const s= (MpegEncContext *)c;
2091     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2092
2093     assert(h==8);
2094
2095     s->dsp.diff_pixels(temp, src1, src2, stride);
2096     s->dsp.fdct(temp);
2097     return s->dsp.sum_abs_dctelem(temp);
2098 }
2099
#if CONFIG_GPL
/* One 8-point integer transform pass. Inputs are read through SRC(n) and
 * results emitted through DST(n, value); both macros must be defined by the
 * user before DCT8_1D is expanded. All inputs are read before the first
 * DST, so SRC and DST may refer to the same storage. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Take the pixel difference of two 8x8 blocks via dsp.diff_pixels, apply
 * DCT8_1D along both dimensions and return the sum of the absolute values
 * of the second-pass outputs.
 *
 * @param c      MpegEncContext supplying dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride byte offset between consecutive rows
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int total = 0;
    int i;

    enc->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform dct[i][0..7] in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform dct[0..7][i]; only the magnitudes are kept */
#define SRC(x) dct[x][i]
#define DST(x,v) total += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return total;
}
#endif
2152
2153 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2154     MpegEncContext * const s= (MpegEncContext *)c;
2155     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2156     int sum=0, i;
2157
2158     assert(h==8);
2159
2160     s->dsp.diff_pixels(temp, src1, src2, stride);
2161     s->dsp.fdct(temp);
2162
2163     for(i=0; i<64; i++)
2164         sum= FFMAX(sum, FFABS(temp[i]));
2165
2166     return sum;
2167 }
2168
2169 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2170     MpegEncContext * const s= (MpegEncContext *)c;
2171     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2172     DCTELEM * const bak = temp+64;
2173     int sum=0, i;
2174
2175     assert(h==8);
2176     s->mb_intra=0;
2177
2178     s->dsp.diff_pixels(temp, src1, src2, stride);
2179
2180     memcpy(bak, temp, 64*sizeof(DCTELEM));
2181
2182     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2183     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2184     ff_simple_idct_8(temp); //FIXME
2185
2186     for(i=0; i<64; i++)
2187         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2188
2189     return sum;
2190 }
2191
2192 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2193     MpegEncContext * const s= (MpegEncContext *)c;
2194     const uint8_t *scantable= s->intra_scantable.permutated;
2195     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2196     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2197     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2198     int i, last, run, bits, level, distortion, start_i;
2199     const int esc_length= s->ac_esc_length;
2200     uint8_t * length;
2201     uint8_t * last_length;
2202
2203     assert(h==8);
2204
2205     copy_block8(lsrc1, src1, 8, stride, 8);
2206     copy_block8(lsrc2, src2, 8, stride, 8);
2207
2208     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2209
2210     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2211
2212     bits=0;
2213
2214     if (s->mb_intra) {
2215         start_i = 1;
2216         length     = s->intra_ac_vlc_length;
2217         last_length= s->intra_ac_vlc_last_length;
2218         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2219     } else {
2220         start_i = 0;
2221         length     = s->inter_ac_vlc_length;
2222         last_length= s->inter_ac_vlc_last_length;
2223     }
2224
2225     if(last>=start_i){
2226         run=0;
2227         for(i=start_i; i<last; i++){
2228             int j= scantable[i];
2229             level= temp[j];
2230
2231             if(level){
2232                 level+=64;
2233                 if((level&(~127)) == 0){
2234                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2235                 }else
2236                     bits+= esc_length;
2237                 run=0;
2238             }else
2239                 run++;
2240         }
2241         i= scantable[last];
2242
2243         level= temp[i] + 64;
2244
2245         assert(level - 64);
2246
2247         if((level&(~127)) == 0){
2248             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2249         }else
2250             bits+= esc_length;
2251
2252     }
2253
2254     if(last>=0){
2255         if(s->mb_intra)
2256             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2257         else
2258             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2259     }
2260
2261     s->dsp.idct_add(lsrc2, 8, temp);
2262
2263     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2264
2265     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2266 }
2267
2268 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2269     MpegEncContext * const s= (MpegEncContext *)c;
2270     const uint8_t *scantable= s->intra_scantable.permutated;
2271     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2272     int i, last, run, bits, level, start_i;
2273     const int esc_length= s->ac_esc_length;
2274     uint8_t * length;
2275     uint8_t * last_length;
2276
2277     assert(h==8);
2278
2279     s->dsp.diff_pixels(temp, src1, src2, stride);
2280
2281     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2282
2283     bits=0;
2284
2285     if (s->mb_intra) {
2286         start_i = 1;
2287         length     = s->intra_ac_vlc_length;
2288         last_length= s->intra_ac_vlc_last_length;
2289         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2290     } else {
2291         start_i = 0;
2292         length     = s->inter_ac_vlc_length;
2293         last_length= s->inter_ac_vlc_last_length;
2294     }
2295
2296     if(last>=start_i){
2297         run=0;
2298         for(i=start_i; i<last; i++){
2299             int j= scantable[i];
2300             level= temp[j];
2301
2302             if(level){
2303                 level+=64;
2304                 if((level&(~127)) == 0){
2305                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2306                 }else
2307                     bits+= esc_length;
2308                 run=0;
2309             }else
2310                 run++;
2311         }
2312         i= scantable[last];
2313
2314         level= temp[i] + 64;
2315
2316         assert(level - 64);
2317
2318         if((level&(~127)) == 0){
2319             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2320         }else
2321             bits+= esc_length;
2322     }
2323
2324     return bits;
2325 }
2326
/* Defines vsad_intra<size>_c(): the sum, over rows 1..h-1 and the first
 * <size> columns, of the absolute difference between each pixel and the
 * pixel one stride further on. The c and dummy parameters are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *row = s;                                                                                 \
    int total = 0;                                                                                          \
    int line, col;                                                                                          \
    for (line = 1; line < h; line++) {                                                                      \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(row[col] - row[col + stride]);                                                   \
        row += stride;                                                                                      \
    }                                                                                                       \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2344
/**
 * Sum, over rows 1..h-1 and 16 columns, of the absolute value of
 * (s1 - s2) at a pixel minus (s1 - s2) one stride further on.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *p = s1;
    const uint8_t *q = s2;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++)
            total += FFABS(p[col] - q[col] - p[col + stride] + q[col + stride]);
        p += stride;
        q += stride;
    }

    return total;
}
2359
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Defines vsse_intra<size>_c(): the sum, over rows 1..h-1 and the first
 * <size> columns, of the squared difference between each pixel and the
 * pixel one stride further on. The c and dummy parameters are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *row = s;                                                                                 \
    int total = 0;                                                                                          \
    int line, col;                                                                                          \
    for (line = 1; line < h; line++) {                                                                      \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(row[col] - row[col + stride]);                                                      \
        row += stride;                                                                                      \
    }                                                                                                       \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2378
/**
 * Sum, over rows 1..h-1 and 16 columns, of the square of
 * (s1 - s2) at a pixel minus (s1 - s2) one stride further on.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *p = s1;
    const uint8_t *q = s2;
    int total = 0;
    int line, col;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int d = p[col] - q[col] - p[col + stride] + q[col + stride];
            total += d * d;
        }
        p += stride;
        q += stride;
    }

    return total;
}
2393
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
2402
2403 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2404 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2405 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2406 #if CONFIG_GPL
2407 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2408 #endif
2409 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2410 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2411 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2412 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2413
/**
 * Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len).
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
2419
/**
 * Element-wise product with the second operand read back to front:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, back;

    for (fwd = 0, back = len - 1; fwd < len; fwd++, back--)
        dst[fwd] = src0[fwd] * src1[back];
}
2426
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i]
 * for i in [0, len).
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2432
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 *
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements
 * @param win  window, 2*len elements
 * @param len  length of each input
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo, hi;

    /* lo walks the first half upwards while hi walks the second half down */
    for (lo = 0, hi = 2*len - 1; lo < len; lo++, hi--) {
        float s0 = src0[lo];
        float s1 = src1[hi - len];
        float wi = win[lo];
        float wj = win[hi];
        dst[lo] = s0*wj - s1*wi;
        dst[hi] = s0*wi + s1*wj;
    }
}
2449
/**
 * Scale a vector: dst[i] = src[i] * mul for i in [0, len).
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
2457
/**
 * Scaled accumulate: dst[i] += src[i] * mul for i in [0, len).
 */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] += src[k] * mul;
        k++;
    }
}
2465
/**
 * In-place butterfly on two float vectors.
 *
 * After the call v1[k] holds the old v1[k] + v2[k] and v2[k] holds the old
 * v1[k] - v2[k].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2476
/**
 * Dot product of two float vectors.
 *
 * Products are accumulated in ascending index order into a single float.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[k] * v2[k] over k in [0, len); 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int remaining;

    for (remaining = len; remaining > 0; remaining--)
        acc += *v1++ * *v2++;

    return acc;
}
2487
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] where
 * min is negative and max is positive.
 *
 * The comparisons are done on the bit patterns as unsigned integers:
 *  - every value with the sign bit set that is "more negative" than min has
 *    a larger unsigned pattern than mini, so a > mini selects mini;
 *  - flipping the sign bit maps positive values above max to patterns
 *    larger than maxisign, so that test selects maxi.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] using integer comparisons on the raw
 * bit patterns; only valid when *min < 0 and *max > 0.
 *
 * Elements are processed in groups of eight, so len is effectively rounded
 * up to a multiple of 8: both buffers must be large enough for that.
 *
 * The float storage is moved to and from uint32_t with memcpy() rather than
 * through casted pointers, which keeps the type punning free of
 * strict-aliasing undefined behaviour.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound (negative)
 * @param max pointer to the upper bound (positive)
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    uint32_t mini, maxi, maxisign;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        for(j=i; j<i+8; j++) {
            uint32_t bits;

            memcpy(&bits, &src[j], sizeof(bits));
            bits = clipf_c_one(bits, mini, maxi, maxisign);
            memcpy(&dst[j], &bits, sizeof(bits));
        }
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds straddle zero (min < 0 < max) the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied to each
 * element. In both cases elements are handled in groups of eight, so len is
 * effectively rounded up to a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = base; k < base + 8; k++)
            dst[k] = av_clipf(src[k], min, max);
}
2532
/**
 * Dot product of two int16_t vectors with a per-term right shift.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each product before it is summed
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--, v1++, v2++)
        acc += (*v1 * *v2) >> shift;

    return acc;
}
2542
/**
 * Dot product of v1 and v2 combined with an in-place update of v1.
 *
 * For each element the product v1[k] * v2[k] is accumulated using the value
 * of v1[k] from before the update, then v1[k] += mul * v3[k].
 *
 * @param v1    first vector; modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor for v3
 * @return the dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order; order--, v1++, v2++, v3++) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
    }
    return acc;
}
2552
/**
 * Apply a symmetric window to int16_t samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k is used
 * for both sample k and its mirror len-1-k. Each product is rounded by
 * adding 1 << 14 and shifted right by 15.
 *
 * @param output destination samples
 * @param input  source samples
 * @param window first half of the window
 * @param len    total number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    unsigned int tail = len - 1;
    int head;

    for (head = 0; head < half; head++, tail--) {
        const int16_t coeff = window[head];

        output[head] = (MUL16(input[head], coeff) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], coeff) + (1 << 14)) >> 15;
    }
}
2565
/**
 * Clip every element of an int32_t vector to [min, max].
 *
 * Whole groups of eight elements go through the unrolled main loop; any
 * remaining elements are handled one by one. The previous do/while form
 * decremented the unsigned len by 8 unconditionally, so len == 0 or a len
 * that was not a multiple of 8 wrapped around and ran far past the end of
 * both buffers. Results for a positive multiple of 8 are unchanged.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (; len >= 8; len -= 8) {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
    }

    /* Fewer than eight elements left (or none at all). */
    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
2581
/* Fixed-point constants used by wmv2_idct_row() and wmv2_idct_col() below.
 * W0 is the plain scale factor 2048; Wk for k = 1..7 is
 * 2048*sqrt(2)*cos(k*pi/16) rounded to the nearest integer. */
2582 #define W0 2048
2583 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2584 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2585 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2586 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2587 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2588 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2589 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2590
2591 static void wmv2_idct_row(short * b)
2592 {
2593     int s1,s2;
2594     int a0,a1,a2,a3,a4,a5,a6,a7;
2595     /*step 1*/
2596     a1 = W1*b[1]+W7*b[7];
2597     a7 = W7*b[1]-W1*b[7];
2598     a5 = W5*b[5]+W3*b[3];
2599     a3 = W3*b[5]-W5*b[3];
2600     a2 = W2*b[2]+W6*b[6];
2601     a6 = W6*b[2]-W2*b[6];
2602     a0 = W0*b[0]+W0*b[4];
2603     a4 = W0*b[0]-W0*b[4];
2604     /*step 2*/
2605     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2606     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2607     /*step 3*/
2608     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2609     b[1] = (a4+a6 +s1   + (1<<7))>>8;
2610     b[2] = (a4-a6 +s2   + (1<<7))>>8;
2611     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2612     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2613     b[5] = (a4-a6 -s2   + (1<<7))>>8;
2614     b[6] = (a4+a6 -s1   + (1<<7))>>8;
2615     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2616 }
2617 static void wmv2_idct_col(short * b)
2618 {
2619     int s1,s2;
2620     int a0,a1,a2,a3,a4,a5,a6,a7;
2621     /*step 1, with extended precision*/
2622     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2623     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2624     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2625     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2626     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2627     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2628     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2629     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2630     /*step 2*/
2631     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2632     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2633     /*step 3*/
2634     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2635     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2636     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2637     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2638
2639     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2640     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2641     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2642     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2643 }
/**
 * In-place 8x8 inverse transform: all eight rows are transformed first,
 * then all eight columns.
 *
 * @param block 64 coefficients, stored row after row
 */
void ff_wmv2_idct_c(short * block)
{
    short *p;

    /* Row pass: each row starts 8 elements after the previous one. */
    for (p = block; p < block + 64; p += 8)
        wmv2_idct_row(p);

    /* Column pass: each column starts at one of the first 8 elements. */
    for (p = block; p < block + 8; p++)
        wmv2_idct_col(p);
}
2654 /* XXX: these functions should be removed as soon as all IDCTs are
2655  converted */
/**
 * Run ff_wmv2_idct_c() on block, then pass the transformed block to
 * ff_put_pixels_clamped_c() together with dest and line_size.
 * Installed as idct_put for FF_IDCT_WMV2 by dsputil_init().
 */
2656 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2657 {
2658     ff_wmv2_idct_c(block);
2659     ff_put_pixels_clamped_c(block, dest, line_size);
2660 }
/**
 * Run ff_wmv2_idct_c() on block, then pass the transformed block to
 * ff_add_pixels_clamped_c() together with dest and line_size.
 * Installed as idct_add for FF_IDCT_WMV2 by dsputil_init().
 */
2661 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2662 {
2663     ff_wmv2_idct_c(block);
2664     ff_add_pixels_clamped_c(block, dest, line_size);
2665 }
/**
 * Call j_rev_dct() on block (presumably transforming it in place), then
 * pass block to ff_put_pixels_clamped_c() together with dest and line_size.
 * Installed as idct_put for FF_IDCT_INT by dsputil_init().
 */
2666 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2667 {
2668     j_rev_dct (block);
2669     ff_put_pixels_clamped_c(block, dest, line_size);
2670 }
/**
 * Call j_rev_dct() on block (presumably transforming it in place), then
 * pass block to ff_add_pixels_clamped_c() together with dest and line_size.
 * Installed as idct_add for FF_IDCT_INT by dsputil_init().
 */
2671 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2672 {
2673     j_rev_dct (block);
2674     ff_add_pixels_clamped_c(block, dest, line_size);
2675 }
2676
/**
 * Call j_rev_dct4() on block, then pass block to put_pixels_clamped4_c()
 * together with dest and line_size.
 * Installed as idct_put when avctx->lowres == 1 (see dsputil_init()).
 */
2677 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2678 {
2679     j_rev_dct4 (block);
2680     put_pixels_clamped4_c(block, dest, line_size);
2681 }
/**
 * Call j_rev_dct4() on block, then pass block to add_pixels_clamped4_c()
 * together with dest and line_size.
 * Installed as idct_add when avctx->lowres == 1 (see dsputil_init()).
 */
2682 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2683 {
2684     j_rev_dct4 (block);
2685     add_pixels_clamped4_c(block, dest, line_size);
2686 }
2687
/**
 * Call j_rev_dct2() on block, then pass block to put_pixels_clamped2_c()
 * together with dest and line_size.
 * Installed as idct_put when avctx->lowres == 2 (see dsputil_init()).
 */
2688 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2689 {
2690     j_rev_dct2 (block);
2691     put_pixels_clamped2_c(block, dest, line_size);
2692 }
/**
 * Call j_rev_dct2() on block, then pass block to add_pixels_clamped2_c()
 * together with dest and line_size.
 * Installed as idct_add when avctx->lowres == 2 (see dsputil_init()).
 */
2693 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2694 {
2695     j_rev_dct2 (block);
2696     add_pixels_clamped2_c(block, dest, line_size);
2697 }
2698
2699 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2700 {
2701     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2702
2703     dest[0] = cm[(block[0] + 4)>>3];
2704 }
2705 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2706 {
2707     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2708
2709     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2710 }
2711
2712 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2713
2714 /* init static data */
2715 av_cold void dsputil_static_init(void)
2716 {
2717     int i;
2718
2719     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2720     for(i=0;i<MAX_NEG_CROP;i++) {
2721         ff_cropTbl[i] = 0;
2722         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2723     }
2724
2725     for(i=0;i<512;i++) {
2726         ff_squareTbl[i] = (i - 256) * (i - 256);
2727     }
2728
2729     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2730 }
2731
2732 int ff_check_alignment(void){
2733     static int did_fail=0;
2734     LOCAL_ALIGNED_16(int, aligned, [4]);
2735
2736     if((intptr_t)aligned & 15){
2737         if(!did_fail){
2738 #if HAVE_MMX || HAVE_ALTIVEC
2739             av_log(NULL, AV_LOG_ERROR,
2740                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2741                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2742                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2743                 "Do not report crashes to Libav developers.\n");
2744 #endif
2745             did_fail=1;
2746         }
2747         return -1;
2748     }
2749     return 0;
2750 }
2751
/**
 * Fill a DSPContext with the C implementations defined in this file (and
 * its templates), then give the per-architecture init routines a chance to
 * run, and finally derive the IDCT coefficient permutation table.
 *
 * Order of operations:
 *  1. forward DCT selection (encoder builds only),
 *  2. inverse DCT selection from avctx->lowres, bits_per_raw_sample and
 *     idct_algo,
 *  3. pixel, comparison, lossless and float/int vector function pointers,
 *  4. bit-depth dependent functions (BIT_DEPTH_FUNCS),
 *  5. architecture-specific init calls,
 *  6. defaults for the 2-tap qpel tables and the idct_permutation table.
 *
 * @param c     context to fill; c->dct_bits is read (for 9/10-bit input)
 *              but never written here, so it has to be set by the caller
 *              beforehand -- NOTE(review): confirm against the callers
 * @param avctx codec context supplying bits_per_raw_sample, dct_algo,
 *              idct_algo and lowres
 */
2752 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2753 {
2754     int i;
2755
         /* Return value deliberately ignored: the check only logs. */
2756     ff_check_alignment();
2757
         /* Forward DCT: 10-bit input always uses the islow_10 pair; otherwise
          * dct_algo picks ifast, FAAN or (default) islow_8. */
2758 #if CONFIG_ENCODERS
2759     if (avctx->bits_per_raw_sample == 10) {
2760         c->fdct    = ff_jpeg_fdct_islow_10;
2761         c->fdct248 = ff_fdct248_islow_10;
2762     } else {
2763         if(avctx->dct_algo==FF_DCT_FASTINT) {
2764             c->fdct    = fdct_ifast;
2765             c->fdct248 = fdct_ifast248;
2766         }
2767         else if(avctx->dct_algo==FF_DCT_FAAN) {
2768             c->fdct    = ff_faandct;
2769             c->fdct248 = ff_faandct248;
2770         }
2771         else {
2772             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2773             c->fdct248 = ff_fdct248_islow_8;
2774         }
2775     }
2776 #endif //CONFIG_ENCODERS
2777
         /* Inverse DCT: lowres 1/2/3 select the reduced-size jref IDCTs and take
          * precedence over both bit depth and idct_algo. */
2778     if(avctx->lowres==1){
2779         c->idct_put= ff_jref_idct4_put;
2780         c->idct_add= ff_jref_idct4_add;
2781         c->idct    = j_rev_dct4;
2782         c->idct_permutation_type= FF_NO_IDCT_PERM;
2783     }else if(avctx->lowres==2){
2784         c->idct_put= ff_jref_idct2_put;
2785         c->idct_add= ff_jref_idct2_add;
2786         c->idct    = j_rev_dct2;
2787         c->idct_permutation_type= FF_NO_IDCT_PERM;
2788     }else if(avctx->lowres==3){
2789         c->idct_put= ff_jref_idct1_put;
2790         c->idct_add= ff_jref_idct1_add;
2791         c->idct    = j_rev_dct1;
2792         c->idct_permutation_type= FF_NO_IDCT_PERM;
2793     }else{
2794         if (avctx->bits_per_raw_sample == 10) {
2795             c->idct_put              = ff_simple_idct_put_10;
2796             c->idct_add              = ff_simple_idct_add_10;
2797             c->idct                  = ff_simple_idct_10;
2798             c->idct_permutation_type = FF_NO_IDCT_PERM;
2799         } else {
2800         if(avctx->idct_algo==FF_IDCT_INT){
2801             c->idct_put= ff_jref_idct_put;
2802             c->idct_add= ff_jref_idct_add;
2803             c->idct    = j_rev_dct;
2804             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2805         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2806                 avctx->idct_algo==FF_IDCT_VP3){
2807             c->idct_put= ff_vp3_idct_put_c;
2808             c->idct_add= ff_vp3_idct_add_c;
2809             c->idct    = ff_vp3_idct_c;
2810             c->idct_permutation_type= FF_NO_IDCT_PERM;
2811         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2812             c->idct_put= ff_wmv2_idct_put_c;
2813             c->idct_add= ff_wmv2_idct_add_c;
2814             c->idct    = ff_wmv2_idct_c;
2815             c->idct_permutation_type= FF_NO_IDCT_PERM;
2816         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2817             c->idct_put= ff_faanidct_put;
2818             c->idct_add= ff_faanidct_add;
2819             c->idct    = ff_faanidct;
2820             c->idct_permutation_type= FF_NO_IDCT_PERM;
2821         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
                 /* Only idct_put is assigned in this branch; idct_add and idct
                  * keep whatever value they had on entry. */
2822             c->idct_put= ff_ea_idct_put_c;
2823             c->idct_permutation_type= FF_NO_IDCT_PERM;
2824         }else{ //accurate/default
2825             c->idct_put = ff_simple_idct_put_8;
2826             c->idct_add = ff_simple_idct_add_8;
2827             c->idct     = ff_simple_idct_8;
2828             c->idct_permutation_type= FF_NO_IDCT_PERM;
2829         }
2830         }
2831     }
2832
2833     c->diff_pixels = diff_pixels_c;
2834     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2835     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2836     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2837     c->sum_abs_dctelem = sum_abs_dctelem_c;
2838     c->gmc1 = gmc1_c;
2839     c->gmc = ff_gmc_c;
2840     c->pix_sum = pix_sum_c;
2841     c->pix_norm1 = pix_norm1_c;
2842
2843     c->fill_block_tab[0] = fill_block16_c;
2844     c->fill_block_tab[1] = fill_block8_c;
2845
2846     /* TODO [0] 16  [1] 8 */
2847     c->pix_abs[0][0] = pix_abs16_c;
2848     c->pix_abs[0][1] = pix_abs16_x2_c;
2849     c->pix_abs[0][2] = pix_abs16_y2_c;
2850     c->pix_abs[0][3] = pix_abs16_xy2_c;
2851     c->pix_abs[1][0] = pix_abs8_c;
2852     c->pix_abs[1][1] = pix_abs8_x2_c;
2853     c->pix_abs[1][2] = pix_abs8_y2_c;
2854     c->pix_abs[1][3] = pix_abs8_xy2_c;
2855
         /* Third-pel tables: only indices 0-2, 4-6 and 8-10 are assigned;
          * 3 and 7 are not written here. */
2856     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2857     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2858     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2859     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2860     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2861     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2862     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2863     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2864     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2865
2866     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2867     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2868     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2869     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2870     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2871     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2872     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2873     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2874     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2875
         /* dspfunc(PFX, IDX, NUM) assigns the sixteen PFX##NUM##_mcXY_c functions
          * to c->PFX##_pixels_tab[IDX][0..15]. */
2876 #define dspfunc(PFX, IDX, NUM) \
2877     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2878     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2879     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2880     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2881     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2882     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2883     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2884     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2885     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2886     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2887     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2888     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2889     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2890     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2891     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2892     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2893
2894     dspfunc(put_qpel, 0, 16);
2895     dspfunc(put_no_rnd_qpel, 0, 16);
2896
2897     dspfunc(avg_qpel, 0, 16);
2898     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2899
2900     dspfunc(put_qpel, 1, 8);
2901     dspfunc(put_no_rnd_qpel, 1, 8);
2902
2903     dspfunc(avg_qpel, 1, 8);
2904     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2905
2906 #undef dspfunc
2907
2908 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2909     ff_mlp_init(c, avctx);
2910 #endif
2911 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2912     ff_intrax8dsp_init(c,avctx);
2913 #endif
2914
2915     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2916     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2917     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2918     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2919     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2920     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2921     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2922     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2923
         /* SET_CMP_FUNC(name) installs name##16_c in slot 0 and name##8x8_c in
          * slot 1 of c->name[]. */
2924 #define SET_CMP_FUNC(name) \
2925     c->name[0]= name ## 16_c;\
2926     c->name[1]= name ## 8x8_c;
2927
2928     SET_CMP_FUNC(hadamard8_diff)
2929     c->hadamard8_diff[4]= hadamard8_intra16_c;
2930     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2931     SET_CMP_FUNC(dct_sad)
2932     SET_CMP_FUNC(dct_max)
2933 #if CONFIG_GPL
2934     SET_CMP_FUNC(dct264_sad)
2935 #endif
2936     c->sad[0]= pix_abs16_c;
2937     c->sad[1]= pix_abs8_c;
2938     c->sse[0]= sse16_c;
2939     c->sse[1]= sse8_c;
2940     c->sse[2]= sse4_c;
2941     SET_CMP_FUNC(quant_psnr)
2942     SET_CMP_FUNC(rd)
2943     SET_CMP_FUNC(bit)
2944     c->vsad[0]= vsad16_c;
2945     c->vsad[4]= vsad_intra16_c;
2946     c->vsad[5]= vsad_intra8_c;
2947     c->vsse[0]= vsse16_c;
2948     c->vsse[4]= vsse_intra16_c;
2949     c->vsse[5]= vsse_intra8_c;
2950     c->nsse[0]= nsse16_c;
2951     c->nsse[1]= nsse8_c;
2952 #if CONFIG_DWT
2953     ff_dsputil_init_dwt(c);
2954 #endif
2955
2956     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2957
2958     c->add_bytes= add_bytes_c;
2959     c->add_bytes_l2= add_bytes_l2_c;
2960     c->diff_bytes= diff_bytes_c;
2961     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2962     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2963     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2964     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2965     c->bswap_buf= bswap_buf;
2966     c->bswap16_buf = bswap16_buf;
2967 #if CONFIG_PNG_DECODER
2968     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
2969 #endif
2970
2971     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2972         c->h263_h_loop_filter= h263_h_loop_filter_c;
2973         c->h263_v_loop_filter= h263_v_loop_filter_c;
2974     }
2975
2976     if (CONFIG_VP3_DECODER) {
2977         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2978         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2979         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2980     }
2981
2982     c->h261_loop_filter= h261_loop_filter_c;
2983
2984     c->try_8x8basis= try_8x8basis_c;
2985     c->add_8x8basis= add_8x8basis_c;
2986
2987 #if CONFIG_VORBIS_DECODER
2988     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
2989 #endif
2990 #if CONFIG_AC3_DECODER
2991     c->ac3_downmix = ff_ac3_downmix_c;
2992 #endif
2993     c->vector_fmul = vector_fmul_c;
2994     c->vector_fmul_reverse = vector_fmul_reverse_c;
2995     c->vector_fmul_add = vector_fmul_add_c;
2996     c->vector_fmul_window = vector_fmul_window_c;
2997     c->vector_clipf = vector_clipf_c;
2998     c->scalarproduct_int16 = scalarproduct_int16_c;
2999     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3000     c->apply_window_int16 = apply_window_int16_c;
3001     c->vector_clip_int32 = vector_clip_int32_c;
3002     c->scalarproduct_float = scalarproduct_float_c;
3003     c->butterflies_float = butterflies_float_c;
3004     c->vector_fmul_scalar = vector_fmul_scalar_c;
3005     c->vector_fmac_scalar = vector_fmac_scalar_c;
3006
3007     c->shrink[0]= av_image_copy_plane;
3008     c->shrink[1]= ff_shrink22;
3009     c->shrink[2]= ff_shrink44;
3010     c->shrink[3]= ff_shrink88;
3011
3012     c->prefetch= just_return;
3013
         /* Zero the 2-tap qpel tables so that entries nobody fills in can be
          * detected (and defaulted) after the architecture init calls below. */
3014     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3015     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3016
         /* FUNC/FUNCC build the bit-depth-suffixed names f_<depth> and
          * f_<depth>_c. */
3017 #undef FUNC
3018 #undef FUNCC
3019 #define FUNC(f, depth) f ## _ ## depth
3020 #define FUNCC(f, depth) f ## _ ## depth ## _c
3021
3022 #define dspfunc1(PFX, IDX, NUM, depth)\
3023     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3024     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3025     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3026     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3027
3028 #define dspfunc2(PFX, IDX, NUM, depth)\
3029     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3030     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3031     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3032     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3033     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3034     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3035     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3036     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3037     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3038     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3039     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3040     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3041     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3042     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3043     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3044     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3045
3046
         /* BIT_DEPTH_FUNCS(depth, dct) installs every function that exists once
          * per pixel bit depth; "dct" is the suffix (_16 or _32) appended to the
          * names of the functions that touch coefficient blocks. */
3047 #define BIT_DEPTH_FUNCS(depth, dct)\
3048     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
3049     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3050     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3051     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
3052     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
3053     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
3054     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
3055     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3056     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3057 \
3058     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3059     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3060     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3061     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3062     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3063     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3064 \
3065     dspfunc1(put       , 0, 16, depth);\
3066     dspfunc1(put       , 1,  8, depth);\
3067     dspfunc1(put       , 2,  4, depth);\
3068     dspfunc1(put       , 3,  2, depth);\
3069     dspfunc1(put_no_rnd, 0, 16, depth);\
3070     dspfunc1(put_no_rnd, 1,  8, depth);\
3071     dspfunc1(avg       , 0, 16, depth);\
3072     dspfunc1(avg       , 1,  8, depth);\
3073     dspfunc1(avg       , 2,  4, depth);\
3074     dspfunc1(avg       , 3,  2, depth);\
3075     dspfunc1(avg_no_rnd, 0, 16, depth);\
3076     dspfunc1(avg_no_rnd, 1,  8, depth);\
3077 \
3078     dspfunc2(put_h264_qpel, 0, 16, depth);\
3079     dspfunc2(put_h264_qpel, 1,  8, depth);\
3080     dspfunc2(put_h264_qpel, 2,  4, depth);\
3081     dspfunc2(put_h264_qpel, 3,  2, depth);\
3082     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3083     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3084     dspfunc2(avg_h264_qpel, 2,  4, depth);
3085
         /* For 9- and 10-bit input c->dct_bits chooses between the _32 and _16
          * coefficient variants. Any other depth is logged at debug level and
          * then deliberately falls through to the 8-bit case. */
3086     switch (avctx->bits_per_raw_sample) {
3087     case 9:
3088         if (c->dct_bits == 32) {
3089             BIT_DEPTH_FUNCS(9, _32);
3090         } else {
3091             BIT_DEPTH_FUNCS(9, _16);
3092         }
3093         break;
3094     case 10:
3095         if (c->dct_bits == 32) {
3096             BIT_DEPTH_FUNCS(10, _32);
3097         } else {
3098             BIT_DEPTH_FUNCS(10, _16);
3099         }
3100         break;
3101     default:
3102         av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3103     case 8:
3104         BIT_DEPTH_FUNCS(8, _16);
3105         break;
3106     }
3107
3108
         /* Architecture-specific initialisation runs after all C defaults are in
          * place. NOTE(review): presumably these replace selected pointers with
          * optimised versions -- their bodies are not part of this file view. */
3109     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3110     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3111     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3112     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3113     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3114     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3115     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3116     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3117     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3118
         /* Every 2-tap qpel entry that is still NULL gets the matching h264 qpel
          * function. The tables are indexed as [0][i] with i up to 63, i.e. they
          * are walked as one flat run of 64 pointers.
          * NOTE(review): this assumes both tables hold 64 contiguous entries
          * (e.g. [4][16]) -- confirm against the DSPContext declaration. */
3119     for(i=0; i<64; i++){
3120         if(!c->put_2tap_qpel_pixels_tab[0][i])
3121             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3122         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3123             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3124     }
3125
         /* Build the coefficient permutation that matches the permutation type of
          * the IDCT that ended up selected. An unknown type is only reported; the
          * table is left untouched in that case. */
3126     switch(c->idct_permutation_type){
3127     case FF_NO_IDCT_PERM:
3128         for(i=0; i<64; i++)
3129             c->idct_permutation[i]= i;
3130         break;
3131     case FF_LIBMPEG2_IDCT_PERM:
3132         for(i=0; i<64; i++)
3133             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3134         break;
3135     case FF_SIMPLE_IDCT_PERM:
3136         for(i=0; i<64; i++)
3137             c->idct_permutation[i]= simple_mmx_permutation[i];
3138         break;
3139     case FF_TRANSPOSE_IDCT_PERM:
3140         for(i=0; i<64; i++)
3141             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3142         break;
3143     case FF_PARTTRANS_IDCT_PERM:
3144         for(i=0; i<64; i++)
3145             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3146         break;
3147     case FF_SSE2_IDCT_PERM:
3148         for(i=0; i<64; i++)
3149             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3150         break;
3151     default:
3152         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3153     }
3154 }