git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41
  42 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  43 uint32_t ff_squareTbl[512] = {0, };
  44
   /* dsputil_template.c is included three times, once for each BIT_DEPTH value
    * (9, 10, then 8).  BIT_DEPTH is not #undef'd after the last include, so it
    * stays defined as 8 for the remainder of this file. */
   45 #define BIT_DEPTH 9
   46 #include "dsputil_template.c"
   47 #undef BIT_DEPTH
   48
   49 #define BIT_DEPTH 10
   50 #include "dsputil_template.c"
   51 #undef BIT_DEPTH
   52
   53 #define BIT_DEPTH 8
   54 #include "dsputil_template.c"
  55
  56 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  57 #define pb_7f (~0UL/255 * 0x7f)
  58 #define pb_80 (~0UL/255 * 0x80)
  59
  60 const uint8_t ff_zigzag_direct[64] = {
  61     0,   1,  8, 16,  9,  2,  3, 10,
  62     17, 24, 32, 25, 18, 11,  4,  5,
  63     12, 19, 26, 33, 40, 48, 41, 34,
  64     27, 20, 13,  6,  7, 14, 21, 28,
  65     35, 42, 49, 56, 57, 50, 43, 36,
  66     29, 22, 15, 23, 30, 37, 44, 51,
  67     58, 59, 52, 45, 38, 31, 39, 46,
  68     53, 60, 61, 54, 47, 55, 62, 63
  69 };
  70
  71 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  72    specification, we interleave the fields */
  73 const uint8_t ff_zigzag248_direct[64] = {
  74      0,  8,  1,  9, 16, 24,  2, 10,
  75     17, 25, 32, 40, 48, 56, 33, 41,
  76     18, 26,  3, 11,  4, 12, 19, 27,
  77     34, 42, 49, 57, 50, 58, 35, 43,
  78     20, 28,  5, 13,  6, 14, 21, 29,
  79     36, 44, 51, 59, 52, 60, 37, 45,
  80     22, 30,  7, 15, 23, 31, 38, 46,
  81     53, 61, 54, 62, 39, 47, 55, 63,
  82 };
  83
   84 /* Inverse of zigzag_direct (not permuted), plus 1, for the MMX quantizer.  Only declared here, with no initializer: 64 uint16_t entries, alignment argument 16. */
   85 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  86
  87 const uint8_t ff_alternate_horizontal_scan[64] = {
  88     0,  1,   2,  3,  8,  9, 16, 17,
  89     10, 11,  4,  5,  6,  7, 15, 14,
  90     13, 12, 19, 18, 24, 25, 32, 33,
  91     26, 27, 20, 21, 22, 23, 28, 29,
  92     30, 31, 34, 35, 40, 41, 48, 49,
  93     42, 43, 36, 37, 38, 39, 44, 45,
  94     46, 47, 50, 51, 56, 57, 58, 59,
  95     52, 53, 54, 55, 60, 61, 62, 63,
  96 };
  97
  98 const uint8_t ff_alternate_vertical_scan[64] = {
  99     0,  8,  16, 24,  1,  9,  2, 10,
 100     17, 25, 32, 40, 48, 56, 57, 49,
 101     41, 33, 26, 18,  3, 11,  4, 12,
 102     19, 27, 34, 42, 50, 58, 35, 43,
 103     51, 59, 20, 28,  5, 13,  6, 14,
 104     21, 29, 36, 44, 52, 60, 37, 45,
 105     53, 61, 22, 30,  7, 15, 23, 31,
 106     38, 46, 54, 62, 39, 47, 55, 63,
 107 };
 108
 109 /* Input permutation for the simple_idct_mmx */
 110 static const uint8_t simple_mmx_permutation[64]={
 111         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 112         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 113         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 114         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 115         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 116         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 117         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 118         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 119 };
 120
 121 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 122
 123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 124     int i;
 125     int end;
 126
 127     st->scantable= src_scantable;
 128
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = src_scantable[i];
 132         st->permutated[i] = permutation[j];
 133 #if ARCH_PPC
 134         st->inverse[j] = i;
 135 #endif
 136     }
 137
 138     end=-1;
 139     for(i=0; i<64; i++){
 140         int j;
 141         j = st->permutated[i];
 142         if(j>end) end=j;
 143         st->raster_end[i]= end;
 144     }
 145 }
 146
 147 static int pix_sum_c(uint8_t * pix, int line_size)
 148 {
 149     int s, i, j;
 150
 151     s = 0;
 152     for (i = 0; i < 16; i++) {
 153         for (j = 0; j < 16; j += 8) {
 154             s += pix[0];
 155             s += pix[1];
 156             s += pix[2];
 157             s += pix[3];
 158             s += pix[4];
 159             s += pix[5];
 160             s += pix[6];
 161             s += pix[7];
 162             pix += 8;
 163         }
 164         pix += line_size - 16;
 165     }
 166     return s;
 167 }
 168
 169 static int pix_norm1_c(uint8_t * pix, int line_size)
 170 {
 171     int s, i, j;
 172     uint32_t *sq = ff_squareTbl + 256;
 173
 174     s = 0;
 175     for (i = 0; i < 16; i++) {
 176         for (j = 0; j < 16; j += 8) {
 177 #if 0
 178             s += sq[pix[0]];
 179             s += sq[pix[1]];
 180             s += sq[pix[2]];
 181             s += sq[pix[3]];
 182             s += sq[pix[4]];
 183             s += sq[pix[5]];
 184             s += sq[pix[6]];
 185             s += sq[pix[7]];
 186 #else
 187 #if HAVE_FAST_64BIT
 188             register uint64_t x=*(uint64_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             s += sq[(x>>32)&0xff];
 194             s += sq[(x>>40)&0xff];
 195             s += sq[(x>>48)&0xff];
 196             s += sq[(x>>56)&0xff];
 197 #else
 198             register uint32_t x=*(uint32_t*)pix;
 199             s += sq[x&0xff];
 200             s += sq[(x>>8)&0xff];
 201             s += sq[(x>>16)&0xff];
 202             s += sq[(x>>24)&0xff];
 203             x=*(uint32_t*)(pix+4);
 204             s += sq[x&0xff];
 205             s += sq[(x>>8)&0xff];
 206             s += sq[(x>>16)&0xff];
 207             s += sq[(x>>24)&0xff];
 208 #endif
 209 #endif
 210             pix += 8;
 211         }
 212         pix += line_size - 16;
 213     }
 214     return s;
 215 }
 216
 217 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 218     int i;
 219
 220     for(i=0; i+8<=w; i+=8){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222         dst[i+1]= av_bswap32(src[i+1]);
 223         dst[i+2]= av_bswap32(src[i+2]);
 224         dst[i+3]= av_bswap32(src[i+3]);
 225         dst[i+4]= av_bswap32(src[i+4]);
 226         dst[i+5]= av_bswap32(src[i+5]);
 227         dst[i+6]= av_bswap32(src[i+6]);
 228         dst[i+7]= av_bswap32(src[i+7]);
 229     }
 230     for(;i<w; i++){
 231         dst[i+0]= av_bswap32(src[i+0]);
 232     }
 233 }
 234
 235 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 236 {
 237     while (len--)
 238         *dst++ = av_bswap16(*src++);
 239 }
 240
 241 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 242 {
 243     int s, i;
 244     uint32_t *sq = ff_squareTbl + 256;
 245
 246     s = 0;
 247     for (i = 0; i < h; i++) {
 248         s += sq[pix1[0] - pix2[0]];
 249         s += sq[pix1[1] - pix2[1]];
 250         s += sq[pix1[2] - pix2[2]];
 251         s += sq[pix1[3] - pix2[3]];
 252         pix1 += line_size;
 253         pix2 += line_size;
 254     }
 255     return s;
 256 }
 257
 258 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 259 {
 260     int s, i;
 261     uint32_t *sq = ff_squareTbl + 256;
 262
 263     s = 0;
 264     for (i = 0; i < h; i++) {
 265         s += sq[pix1[0] - pix2[0]];
 266         s += sq[pix1[1] - pix2[1]];
 267         s += sq[pix1[2] - pix2[2]];
 268         s += sq[pix1[3] - pix2[3]];
 269         s += sq[pix1[4] - pix2[4]];
 270         s += sq[pix1[5] - pix2[5]];
 271         s += sq[pix1[6] - pix2[6]];
 272         s += sq[pix1[7] - pix2[7]];
 273         pix1 += line_size;
 274         pix2 += line_size;
 275     }
 276     return s;
 277 }
 278
 279 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 280 {
 281     int s, i;
 282     uint32_t *sq = ff_squareTbl + 256;
 283
 284     s = 0;
 285     for (i = 0; i < h; i++) {
 286         s += sq[pix1[ 0] - pix2[ 0]];
 287         s += sq[pix1[ 1] - pix2[ 1]];
 288         s += sq[pix1[ 2] - pix2[ 2]];
 289         s += sq[pix1[ 3] - pix2[ 3]];
 290         s += sq[pix1[ 4] - pix2[ 4]];
 291         s += sq[pix1[ 5] - pix2[ 5]];
 292         s += sq[pix1[ 6] - pix2[ 6]];
 293         s += sq[pix1[ 7] - pix2[ 7]];
 294         s += sq[pix1[ 8] - pix2[ 8]];
 295         s += sq[pix1[ 9] - pix2[ 9]];
 296         s += sq[pix1[10] - pix2[10]];
 297         s += sq[pix1[11] - pix2[11]];
 298         s += sq[pix1[12] - pix2[12]];
 299         s += sq[pix1[13] - pix2[13]];
 300         s += sq[pix1[14] - pix2[14]];
 301         s += sq[pix1[15] - pix2[15]];
 302
 303         pix1 += line_size;
 304         pix2 += line_size;
 305     }
 306     return s;
 307 }
 308
 309 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 310                           const uint8_t *s2, int stride){
 311     int i;
 312
 313     /* read the pixels */
 314     for(i=0;i<8;i++) {
 315         block[0] = s1[0] - s2[0];
 316         block[1] = s1[1] - s2[1];
 317         block[2] = s1[2] - s2[2];
 318         block[3] = s1[3] - s2[3];
 319         block[4] = s1[4] - s2[4];
 320         block[5] = s1[5] - s2[5];
 321         block[6] = s1[6] - s2[6];
 322         block[7] = s1[7] - s2[7];
 323         s1 += stride;
 324         s2 += stride;
 325         block += 8;
 326     }
 327 }
 328
 329
 330 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 331                              int line_size)
 332 {
 333     int i;
 334     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 335
 336     /* read the pixels */
 337     for(i=0;i<8;i++) {
 338         pixels[0] = cm[block[0]];
 339         pixels[1] = cm[block[1]];
 340         pixels[2] = cm[block[2]];
 341         pixels[3] = cm[block[3]];
 342         pixels[4] = cm[block[4]];
 343         pixels[5] = cm[block[5]];
 344         pixels[6] = cm[block[6]];
 345         pixels[7] = cm[block[7]];
 346
 347         pixels += line_size;
 348         block += 8;
 349     }
 350 }
 351
 352 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 353                                  int line_size)
 354 {
 355     int i;
 356     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 357
 358     /* read the pixels */
 359     for(i=0;i<4;i++) {
 360         pixels[0] = cm[block[0]];
 361         pixels[1] = cm[block[1]];
 362         pixels[2] = cm[block[2]];
 363         pixels[3] = cm[block[3]];
 364
 365         pixels += line_size;
 366         block += 8;
 367     }
 368 }
 369
 370 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 371                                  int line_size)
 372 {
 373     int i;
 374     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 375
 376     /* read the pixels */
 377     for(i=0;i<2;i++) {
 378         pixels[0] = cm[block[0]];
 379         pixels[1] = cm[block[1]];
 380
 381         pixels += line_size;
 382         block += 8;
 383     }
 384 }
 385
 386 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 387                                     uint8_t *restrict pixels,
 388                                     int line_size)
 389 {
 390     int i, j;
 391
 392     for (i = 0; i < 8; i++) {
 393         for (j = 0; j < 8; j++) {
 394             if (*block < -128)
 395                 *pixels = 0;
 396             else if (*block > 127)
 397                 *pixels = 255;
 398             else
 399                 *pixels = (uint8_t)(*block + 128);
 400             block++;
 401             pixels++;
 402         }
 403         pixels += (line_size - 8);
 404     }
 405 }
 406
 407 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 408                              int line_size)
 409 {
 410     int i;
 411     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 412
 413     /* read the pixels */
 414     for(i=0;i<8;i++) {
 415         pixels[0] = cm[pixels[0] + block[0]];
 416         pixels[1] = cm[pixels[1] + block[1]];
 417         pixels[2] = cm[pixels[2] + block[2]];
 418         pixels[3] = cm[pixels[3] + block[3]];
 419         pixels[4] = cm[pixels[4] + block[4]];
 420         pixels[5] = cm[pixels[5] + block[5]];
 421         pixels[6] = cm[pixels[6] + block[6]];
 422         pixels[7] = cm[pixels[7] + block[7]];
 423         pixels += line_size;
 424         block += 8;
 425     }
 426 }
 427
 428 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 429                           int line_size)
 430 {
 431     int i;
 432     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 433
 434     /* read the pixels */
 435     for(i=0;i<4;i++) {
 436         pixels[0] = cm[pixels[0] + block[0]];
 437         pixels[1] = cm[pixels[1] + block[1]];
 438         pixels[2] = cm[pixels[2] + block[2]];
 439         pixels[3] = cm[pixels[3] + block[3]];
 440         pixels += line_size;
 441         block += 8;
 442     }
 443 }
 444
 445 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 446                           int line_size)
 447 {
 448     int i;
 449     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 450
 451     /* read the pixels */
 452     for(i=0;i<2;i++) {
 453         pixels[0] = cm[pixels[0] + block[0]];
 454         pixels[1] = cm[pixels[1] + block[1]];
 455         pixels += line_size;
 456         block += 8;
 457     }
 458 }
 459
 460 static int sum_abs_dctelem_c(DCTELEM *block)
 461 {
 462     int sum=0, i;
 463     for(i=0; i<64; i++)
 464         sum+= FFABS(block[i]);
 465     return sum;
 466 }
 467
 468 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 469 {
 470     int i;
 471
 472     for (i = 0; i < h; i++) {
 473         memset(block, value, 16);
 474         block += line_size;
 475     }
 476 }
 477
 478 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 479 {
 480     int i;
 481
 482     for (i = 0; i < h; i++) {
 483         memset(block, value, 8);
 484         block += line_size;
 485     }
 486 }
 487
 488 #define avg2(a,b) ((a+b+1)>>1)
 489 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 490
 491 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 492 {
 493     const int A=(16-x16)*(16-y16);
 494     const int B=(   x16)*(16-y16);
 495     const int C=(16-x16)*(   y16);
 496     const int D=(   x16)*(   y16);
 497     int i;
 498
 499     for(i=0; i<h; i++)
 500     {
 501         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 502         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 503         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 504         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 505         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 506         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 507         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 508         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 509         dst+= stride;
 510         src+= stride;
 511     }
 512 }
 513
 514 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 515                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 516 {
 517     int y, vx, vy;
 518     const int s= 1<<shift;
 519
 520     width--;
 521     height--;
 522
 523     for(y=0; y<h; y++){
 524         int x;
 525
 526         vx= ox;
 527         vy= oy;
 528         for(x=0; x<8; x++){ //XXX FIXME optimize
 529             int src_x, src_y, frac_x, frac_y, index;
 530
 531             src_x= vx>>16;
 532             src_y= vy>>16;
 533             frac_x= src_x&(s-1);
 534             frac_y= src_y&(s-1);
 535             src_x>>=shift;
 536             src_y>>=shift;
 537
 538             if((unsigned)src_x < width){
 539                 if((unsigned)src_y < height){
 540                     index= src_x + src_y*stride;
 541                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 542                                            + src[index       +1]*   frac_x )*(s-frac_y)
 543                                         + (  src[index+stride  ]*(s-frac_x)
 544                                            + src[index+stride+1]*   frac_x )*   frac_y
 545                                         + r)>>(shift*2);
 546                 }else{
 547                     index= src_x + av_clip(src_y, 0, height)*stride;
 548                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 549                                           + src[index       +1]*   frac_x )*s
 550                                         + r)>>(shift*2);
 551                 }
 552             }else{
 553                 if((unsigned)src_y < height){
 554                     index= av_clip(src_x, 0, width) + src_y*stride;
 555                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 556                                            + src[index+stride  ]*   frac_y )*s
 557                                         + r)>>(shift*2);
 558                 }else{
 559                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 560                     dst[y*stride + x]=    src[index         ];
 561                 }
 562             }
 563
 564             vx+= dxx;
 565             vy+= dyx;
 566         }
 567         ox += dxy;
 568         oy += dyy;
 569     }
 570 }
 571
 572 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 573     switch(width){
 574     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 575     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 576     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 577     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 578     }
 579 }
 580
 581 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 582     int i,j;
 583     for (i=0; i < height; i++) {
 584       for (j=0; j < width; j++) {
 585         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 586       }
 587       src += stride;
 588       dst += stride;
 589     }
 590 }
 591
 592 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 593     int i,j;
 594     for (i=0; i < height; i++) {
 595       for (j=0; j < width; j++) {
 596         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 597       }
 598       src += stride;
 599       dst += stride;
 600     }
 601 }
 602
 603 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 604     int i,j;
 605     for (i=0; i < height; i++) {
 606       for (j=0; j < width; j++) {
 607         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 608       }
 609       src += stride;
 610       dst += stride;
 611     }
 612 }
 613
 614 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 615     int i,j;
 616     for (i=0; i < height; i++) {
 617       for (j=0; j < width; j++) {
 618         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 619       }
 620       src += stride;
 621       dst += stride;
 622     }
 623 }
 624
 625 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 626     int i,j;
 627     for (i=0; i < height; i++) {
 628       for (j=0; j < width; j++) {
 629         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 630       }
 631       src += stride;
 632       dst += stride;
 633     }
 634 }
 635
 636 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 637     int i,j;
 638     for (i=0; i < height; i++) {
 639       for (j=0; j < width; j++) {
 640         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 641       }
 642       src += stride;
 643       dst += stride;
 644     }
 645 }
 646
 647 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 648     int i,j;
 649     for (i=0; i < height; i++) {
 650       for (j=0; j < width; j++) {
 651         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 652       }
 653       src += stride;
 654       dst += stride;
 655     }
 656 }
 657
 658 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 659     int i,j;
 660     for (i=0; i < height; i++) {
 661       for (j=0; j < width; j++) {
 662         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 663       }
 664       src += stride;
 665       dst += stride;
 666     }
 667 }
 668
 669 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 670     switch(width){
 671     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 672     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 673     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 674     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 675     }
 676 }
 677
 678 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 679     int i,j;
 680     for (i=0; i < height; i++) {
 681       for (j=0; j < width; j++) {
 682         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 683       }
 684       src += stride;
 685       dst += stride;
 686     }
 687 }
 688
 689 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 690     int i,j;
 691     for (i=0; i < height; i++) {
 692       for (j=0; j < width; j++) {
 693         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 694       }
 695       src += stride;
 696       dst += stride;
 697     }
 698 }
 699
 700 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 701     int i,j;
 702     for (i=0; i < height; i++) {
 703       for (j=0; j < width; j++) {
 704         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 705       }
 706       src += stride;
 707       dst += stride;
 708     }
 709 }
 710
 711 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 712     int i,j;
 713     for (i=0; i < height; i++) {
 714       for (j=0; j < width; j++) {
 715         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 716       }
 717       src += stride;
 718       dst += stride;
 719     }
 720 }
 721
 722 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 723     int i,j;
 724     for (i=0; i < height; i++) {
 725       for (j=0; j < width; j++) {
 726         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 727       }
 728       src += stride;
 729       dst += stride;
 730     }
 731 }
 732
 733 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 734     int i,j;
 735     for (i=0; i < height; i++) {
 736       for (j=0; j < width; j++) {
 737         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 738       }
 739       src += stride;
 740       dst += stride;
 741     }
 742 }
 743
 744 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 745     int i,j;
 746     for (i=0; i < height; i++) {
 747       for (j=0; j < width; j++) {
 748         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 749       }
 750       src += stride;
 751       dst += stride;
 752     }
 753 }
 754
 755 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 756     int i,j;
 757     for (i=0; i < height; i++) {
 758       for (j=0; j < width; j++) {
 759         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 760       }
 761       src += stride;
 762       dst += stride;
 763     }
 764 }
 765
 766 #define QPEL_MC(r, OPNAME, RND, OP) \
 767 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 768     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 769     int i;\
 770     for(i=0; i<h; i++)\
 771     {\
 772         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 773         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 774         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 775         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 776         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 777         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 778         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 779         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 780         dst+=dstStride;\
 781         src+=srcStride;\
 782     }\
 783 }\
 784 \
 785 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 786     const int w=8;\
 787     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 788     int i;\
 789     for(i=0; i<w; i++)\
 790     {\
 791         const int src0= src[0*srcStride];\
 792         const int src1= src[1*srcStride];\
 793         const int src2= src[2*srcStride];\
 794         const int src3= src[3*srcStride];\
 795         const int src4= src[4*srcStride];\
 796         const int src5= src[5*srcStride];\
 797         const int src6= src[6*srcStride];\
 798         const int src7= src[7*srcStride];\
 799         const int src8= src[8*srcStride];\
 800         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 801         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 802         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 803         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 804         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 805         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 806         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 807         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 808         dst++;\
 809         src++;\
 810     }\
 811 }\
 812 \
 813 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 814     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 815     int i;\
 816     \
 817     for(i=0; i<h; i++)\
 818     {\
 819         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 820         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 821         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 822         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 823         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 824         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 825         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 826         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 827         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 828         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 829         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 830         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 831         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 832         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 833         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 834         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 835         dst+=dstStride;\
 836         src+=srcStride;\
 837     }\
 838 }\
 839 \
 840 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 841     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 842     int i;\
 843     const int w=16;\
 844     for(i=0; i<w; i++)\
 845     {\
 846         const int src0= src[0*srcStride];\
 847         const int src1= src[1*srcStride];\
 848         const int src2= src[2*srcStride];\
 849         const int src3= src[3*srcStride];\
 850         const int src4= src[4*srcStride];\
 851         const int src5= src[5*srcStride];\
 852         const int src6= src[6*srcStride];\
 853         const int src7= src[7*srcStride];\
 854         const int src8= src[8*srcStride];\
 855         const int src9= src[9*srcStride];\
 856         const int src10= src[10*srcStride];\
 857         const int src11= src[11*srcStride];\
 858         const int src12= src[12*srcStride];\
 859         const int src13= src[13*srcStride];\
 860         const int src14= src[14*srcStride];\
 861         const int src15= src[15*srcStride];\
 862         const int src16= src[16*srcStride];\
 863         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 864         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 865         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 866         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 867         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 868         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 869         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 870         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 871         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 872         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 873         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 874         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 875         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 876         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 877         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 878         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 879         dst++;\
 880         src++;\
 881     }\
 882 }\
 883 \
 884 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 885     uint8_t half[64];\
 886     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 887     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 888 }\
 889 \
 890 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 891     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 892 }\
 893 \
 894 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 895     uint8_t half[64];\
 896     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 897     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 898 }\
 899 \
 900 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 901     uint8_t full[16*9];\
 902     uint8_t half[64];\
 903     copy_block9(full, src, 16, stride, 9);\
 904     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 905     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 906 }\
 907 \
 908 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 909     uint8_t full[16*9];\
 910     copy_block9(full, src, 16, stride, 9);\
 911     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 912 }\
 913 \
 914 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 915     uint8_t full[16*9];\
 916     uint8_t half[64];\
 917     copy_block9(full, src, 16, stride, 9);\
 918     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 919     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 920 }\
 921 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 922     uint8_t full[16*9];\
 923     uint8_t halfH[72];\
 924     uint8_t halfV[64];\
 925     uint8_t halfHV[64];\
 926     copy_block9(full, src, 16, stride, 9);\
 927     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 928     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 929     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 930     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 931 }\
 932 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 933     uint8_t full[16*9];\
 934     uint8_t halfH[72];\
 935     uint8_t halfHV[64];\
 936     copy_block9(full, src, 16, stride, 9);\
 937     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 938     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 939     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 940     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 941 }\
 942 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 943     uint8_t full[16*9];\
 944     uint8_t halfH[72];\
 945     uint8_t halfV[64];\
 946     uint8_t halfHV[64];\
 947     copy_block9(full, src, 16, stride, 9);\
 948     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 949     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 950     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 951     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 952 }\
 953 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
 954     uint8_t full[16*9];\
 955     uint8_t halfH[72];\
 956     uint8_t halfHV[64];\
 957     copy_block9(full, src, 16, stride, 9);\
 958     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 959     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 960     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 961     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 962 }\
 963 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
 964     uint8_t full[16*9];\
 965     uint8_t halfH[72];\
 966     uint8_t halfV[64];\
 967     uint8_t halfHV[64];\
 968     copy_block9(full, src, 16, stride, 9);\
 969     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 970     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 971     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 972     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 973 }\
 974 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
 975     uint8_t full[16*9];\
 976     uint8_t halfH[72];\
 977     uint8_t halfHV[64];\
 978     copy_block9(full, src, 16, stride, 9);\
 979     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 980     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 981     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 982     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 983 }\
 984 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
 985     uint8_t full[16*9];\
 986     uint8_t halfH[72];\
 987     uint8_t halfV[64];\
 988     uint8_t halfHV[64];\
 989     copy_block9(full, src, 16, stride, 9);\
 990     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 991     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 992     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 993     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 994 }\
 995 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
 996     uint8_t full[16*9];\
 997     uint8_t halfH[72];\
 998     uint8_t halfHV[64];\
 999     copy_block9(full, src, 16, stride, 9);\
1000     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1004 }\
1005 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1006     uint8_t halfH[72];\
1007     uint8_t halfHV[64];\
1008     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1009     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1010     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1011 }\
1012 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1013     uint8_t halfH[72];\
1014     uint8_t halfHV[64];\
1015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1016     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1018 }\
1019 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020     uint8_t full[16*9];\
1021     uint8_t halfH[72];\
1022     uint8_t halfV[64];\
1023     uint8_t halfHV[64];\
1024     copy_block9(full, src, 16, stride, 9);\
1025     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1029 }\
1030 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1031     uint8_t full[16*9];\
1032     uint8_t halfH[72];\
1033     copy_block9(full, src, 16, stride, 9);\
1034     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1035     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1036     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1037 }\
1038 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1039     uint8_t full[16*9];\
1040     uint8_t halfH[72];\
1041     uint8_t halfV[64];\
1042     uint8_t halfHV[64];\
1043     copy_block9(full, src, 16, stride, 9);\
1044     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1045     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1046     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1047     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1048 }\
1049 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1050     uint8_t full[16*9];\
1051     uint8_t halfH[72];\
1052     copy_block9(full, src, 16, stride, 9);\
1053     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1054     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1055     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1056 }\
1057 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1058     uint8_t halfH[72];\
1059     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1060     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1061 }\
1062 \
1063 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1064     uint8_t half[256];\
1065     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1066     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1067 }\
1068 \
1069 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1070     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1071 }\
1072 \
1073 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1074     uint8_t half[256];\
1075     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1076     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1077 }\
1078 \
1079 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1080     uint8_t full[24*17];\
1081     uint8_t half[256];\
1082     copy_block17(full, src, 24, stride, 17);\
1083     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1084     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1085 }\
1086 \
1087 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1088     uint8_t full[24*17];\
1089     copy_block17(full, src, 24, stride, 17);\
1090     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1091 }\
1092 \
1093 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1094     uint8_t full[24*17];\
1095     uint8_t half[256];\
1096     copy_block17(full, src, 24, stride, 17);\
1097     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1098     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1099 }\
1100 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1101     uint8_t full[24*17];\
1102     uint8_t halfH[272];\
1103     uint8_t halfV[256];\
1104     uint8_t halfHV[256];\
1105     copy_block17(full, src, 24, stride, 17);\
1106     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1107     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1108     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1109     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1110 }\
1111 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1112     uint8_t full[24*17];\
1113     uint8_t halfH[272];\
1114     uint8_t halfHV[256];\
1115     copy_block17(full, src, 24, stride, 17);\
1116     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1117     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1118     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1119     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1120 }\
1121 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1122     uint8_t full[24*17];\
1123     uint8_t halfH[272];\
1124     uint8_t halfV[256];\
1125     uint8_t halfHV[256];\
1126     copy_block17(full, src, 24, stride, 17);\
1127     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1128     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1129     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1130     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1131 }\
1132 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1133     uint8_t full[24*17];\
1134     uint8_t halfH[272];\
1135     uint8_t halfHV[256];\
1136     copy_block17(full, src, 24, stride, 17);\
1137     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1138     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1139     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1140     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1141 }\
1142 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1143     uint8_t full[24*17];\
1144     uint8_t halfH[272];\
1145     uint8_t halfV[256];\
1146     uint8_t halfHV[256];\
1147     copy_block17(full, src, 24, stride, 17);\
1148     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1149     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1150     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1151     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1152 }\
1153 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1154     uint8_t full[24*17];\
1155     uint8_t halfH[272];\
1156     uint8_t halfHV[256];\
1157     copy_block17(full, src, 24, stride, 17);\
1158     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1159     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1160     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1161     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1162 }\
1163 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1164     uint8_t full[24*17];\
1165     uint8_t halfH[272];\
1166     uint8_t halfV[256];\
1167     uint8_t halfHV[256];\
1168     copy_block17(full, src, 24, stride, 17);\
1169     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1170     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1171     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1172     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1173 }\
1174 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1175     uint8_t full[24*17];\
1176     uint8_t halfH[272];\
1177     uint8_t halfHV[256];\
1178     copy_block17(full, src, 24, stride, 17);\
1179     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1180     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1181     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1182     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1183 }\
1184 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1185     uint8_t halfH[272];\
1186     uint8_t halfHV[256];\
1187     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1188     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1189     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1190 }\
1191 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1192     uint8_t halfH[272];\
1193     uint8_t halfHV[256];\
1194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1195     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1197 }\
1198 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199     uint8_t full[24*17];\
1200     uint8_t halfH[272];\
1201     uint8_t halfV[256];\
1202     uint8_t halfHV[256];\
1203     copy_block17(full, src, 24, stride, 17);\
1204     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1208 }\
1209 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1210     uint8_t full[24*17];\
1211     uint8_t halfH[272];\
1212     copy_block17(full, src, 24, stride, 17);\
1213     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1214     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1215     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1216 }\
1217 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1218     uint8_t full[24*17];\
1219     uint8_t halfH[272];\
1220     uint8_t halfV[256];\
1221     uint8_t halfHV[256];\
1222     copy_block17(full, src, 24, stride, 17);\
1223     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1225     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1227 }\
1228 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1229     uint8_t full[24*17];\
1230     uint8_t halfH[272];\
1231     copy_block17(full, src, 24, stride, 17);\
1232     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1233     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1234     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1235 }\
1236 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1237     uint8_t halfH[272];\
1238     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1239     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1240 }
1241
/*
 * Store operators plugged into QPEL_MC as its OP argument.
 *
 * 'b' is the raw filter sum; the lowpass taps used inside QPEL_MC
 * (20, 20, -6, -6, 3, 3, -1, -1) add up to 32, hence the shift by 5.
 * 'cm' is not a macro parameter: it must be a table pointer in scope at the
 * expansion site (the lowpass functions set it to ff_cropTbl + MAX_NEG_CROP).
 *
 *   op_put         : a = cm[(b + 16) >> 5]
 *   op_put_no_rnd  : a = cm[(b + 15) >> 5]            (bias 15 instead of 16)
 *   op_avg         : a = (a + cm[(b + 16) >> 5] + 1) >> 1
 *   op_avg_no_rnd  : a = (a + cm[(b + 15) >> 5]) >> 1
 *
 * Note that 'a' is evaluated twice by the avg variants and is used as an
 * lvalue by all of them.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the qpel function families: put, put_no_rnd and avg.
 * The avg_no_rnd family is intentionally left disabled below, so
 * op_avg_no_rnd is currently unused. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The store operators are only meaningful inside QPEL_MC; drop them again. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1255
/*
 * The mc00 entries are not generated by QPEL_MC; they are aliased to
 * existing whole-block put/avg functions instead.
 * Both put_no_rnd aliases map to a plain "put" function, i.e. there is no
 * separate no-rounding variant for this position.
 * NOTE(review): the last alias targets ff_put_pixels16x16_8_c (with a "_8"
 * suffix) while the others use un-suffixed names -- confirm that both
 * symbols exist and that this difference is intended.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1262
1263 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1264     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1265     int i;
1266
1267     for(i=0; i<h; i++){
1268         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1269         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1270         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1271         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1272         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1273         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1274         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1275         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1276         dst+=dstStride;
1277         src+=srcStride;
1278     }
1279 }
1280
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points. Each one simply forwards to the matching
 * 8-bit "xy2" pixel function with a fixed block height (16 or 8).
 * They are only compiled when the RV40 decoder is enabled.
 */

/** 16x16 put: forwards to put_pixels16_xy2_8_c with h = 16. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/** 16x16 avg: forwards to avg_pixels16_xy2_8_c with h = 16. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/** 8x8 put: forwards to put_pixels8_xy2_8_c with h = 8. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/** 8x8 avg: forwards to avg_pixels8_xy2_8_c with h = 8. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1295
1296 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1297     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1298     int i;
1299
1300     for(i=0; i<w; i++){
1301         const int src_1= src[ -srcStride];
1302         const int src0 = src[0          ];
1303         const int src1 = src[  srcStride];
1304         const int src2 = src[2*srcStride];
1305         const int src3 = src[3*srcStride];
1306         const int src4 = src[4*srcStride];
1307         const int src5 = src[5*srcStride];
1308         const int src6 = src[6*srcStride];
1309         const int src7 = src[7*srcStride];
1310         const int src8 = src[8*srcStride];
1311         const int src9 = src[9*srcStride];
1312         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1313         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1314         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1315         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1316         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1317         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1318         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1319         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1320         src++;
1321         dst++;
1322     }
1323 }
1324
/**
 * 8x8 mspel position mc10: the horizontally filtered block is combined with
 * the unshifted source block by put_pixels8_l2_8 (defined elsewhere;
 * presumably an average of its two inputs).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8]; /* 8 rows of 8 pixels, row stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1330
/**
 * 8x8 mspel position mc20: the horizontal lowpass of 8 rows is written
 * straight into dst; source and destination share the same stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1334
/**
 * 8x8 mspel position mc30: the horizontally filtered block is combined with
 * the source block shifted one pixel to the right (src + 1) by
 * put_pixels8_l2_8 (defined elsewhere; presumably an average of its inputs).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8]; /* 8 rows of 8 pixels, row stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1340
/**
 * 8x8 mspel position mc02: the vertical lowpass of 8 columns is written
 * straight into dst; source and destination share the same stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1344
/**
 * 8x8 mspel position mc12.
 *
 * Builds a vertically filtered block from the source and a block filtered
 * in both directions, then combines the two with put_pixels8_l2_8 (defined
 * elsewhere; presumably an average of its inputs).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 horizontally filtered rows: one above the block, the 8 block rows
     * and two below, as needed by the vertical 4-tap filter. */
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* h_rows + 8 skips the extra top row so that it becomes row -1. */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * 8x8 mspel position mc32.
 *
 * Same scheme as mc12, except that the vertical-only block is taken from
 * the source shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 horizontally filtered rows: one above the block, the 8 block rows
     * and two below, as needed by the vertical 4-tap filter. */
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    /* h_rows + 8 skips the extra top row so that it becomes row -1. */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * 8x8 mspel position mc22: horizontal lowpass into a temporary of 11 rows
 * (starting one row above the block), followed by the vertical lowpass of
 * that temporary written directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8 * 11];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    /* h_rows + 8 skips the extra top row so that it becomes row -1. */
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1368
1369 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1370     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1371     int x;
1372     const int strength= ff_h263_loop_filter_strength[qscale];
1373
1374     for(x=0; x<8; x++){
1375         int d1, d2, ad1;
1376         int p0= src[x-2*stride];
1377         int p1= src[x-1*stride];
1378         int p2= src[x+0*stride];
1379         int p3= src[x+1*stride];
1380         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1381
1382         if     (d<-2*strength) d1= 0;
1383         else if(d<-  strength) d1=-2*strength - d;
1384         else if(d<   strength) d1= d;
1385         else if(d< 2*strength) d1= 2*strength - d;
1386         else                   d1= 0;
1387
1388         p1 += d1;
1389         p2 -= d1;
1390         if(p1&256) p1= ~(p1>>31);
1391         if(p2&256) p2= ~(p2>>31);
1392
1393         src[x-1*stride] = p1;
1394         src[x+0*stride] = p2;
1395
1396         ad1= FFABS(d1)>>1;
1397
1398         d2= av_clip((p0-p3)/4, -ad1, ad1);
1399
1400         src[x-2*stride] = p0 - d2;
1401         src[x+  stride] = p3 + d2;
1402     }
1403     }
1404 }
1405
1406 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1407     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1408     int y;
1409     const int strength= ff_h263_loop_filter_strength[qscale];
1410
1411     for(y=0; y<8; y++){
1412         int d1, d2, ad1;
1413         int p0= src[y*stride-2];
1414         int p1= src[y*stride-1];
1415         int p2= src[y*stride+0];
1416         int p3= src[y*stride+1];
1417         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1418
1419         if     (d<-2*strength) d1= 0;
1420         else if(d<-  strength) d1=-2*strength - d;
1421         else if(d<   strength) d1= d;
1422         else if(d< 2*strength) d1= 2*strength - d;
1423         else                   d1= 0;
1424
1425         p1 += d1;
1426         p2 -= d1;
1427         if(p1&256) p1= ~(p1>>31);
1428         if(p2&256) p2= ~(p2>>31);
1429
1430         src[y*stride-1] = p1;
1431         src[y*stride+0] = p2;
1432
1433         ad1= FFABS(d1)>>1;
1434
1435         d2= av_clip((p0-p3)/4, -ad1, ad1);
1436
1437         src[y*stride-2] = p0 - d2;
1438         src[y*stride+1] = p3 + d2;
1439     }
1440     }
1441 }
1442
/**
 * Separable (1, 2, 1) smoothing filter applied in place to an 8x8 block.
 *
 * Pass 1 filters vertically into an int temporary; the first and last row
 * are not filtered and are stored as 4 * pixel so that every temporary has
 * the same scale. Pass 2 filters horizontally; the first and last column
 * are not filtered horizontally and are only scaled back with rounding
 * ((t + 2) >> 2), the interior uses (t[x-1] + 2*t[x] + t[x+1] + 8) >> 4.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vfilt[8 * 8];
    int row, col;

    /* Pass 1: vertical filter, src -> vfilt. */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        int *out            = vfilt + row * 8;
        const int is_border = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (is_border)
                out[col] = 4 * line[col];
            else
                out[col] = line[col - stride] + 2 * line[col] + line[col + stride];
        }
    }

    /* Pass 2: horizontal filter, vfilt -> src. */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row * stride;
        const int *in = vfilt + row * 8;

        line[0] = (in[0] + 2) >> 2;
        line[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2 * in[col] + in[col + 1] + 8) >> 4;
    }
}
1469
/**
 * Sum of absolute differences between two blocks that are 16 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated sum of |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1497
/**
 * Sum of absolute differences between a 16 pixel wide block and a reference
 * in which every pixel is replaced by avg2() of itself and its right-hand
 * neighbour. Each reference row is therefore read up to pix2[16].
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated sum of absolute differences
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1525
/**
 * Sum of absolute differences between a 16 pixel wide block and a reference
 * in which every pixel is replaced by avg2() of itself and the pixel one row
 * below it. The reference is therefore read for h + 1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated sum of absolute differences
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1555
/**
 * Sum of absolute differences between a 16 pixel wide block and a reference
 * in which every pixel is replaced by avg4() of the 2x2 neighbourhood made
 * of itself, its right-hand neighbour and the two pixels below those. The
 * reference is therefore read for h + 1 rows and up to column 16.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return the accumulated sum of absolute differences
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1585
/**
 * Sum of absolute differences between two 8-sample-wide blocks.
 *
 * @param v         unused
 * @param pix1      first row of the first block
 * @param pix2      first row of the second block
 * @param line_size offset between two consecutive rows, applied to both blocks
 * @param h         number of rows to process
 * @return the accumulated absolute difference over 8 x h samples
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
1605
/**
 * Sum of absolute differences over an 8-sample-wide block, where every
 * reference sample is first combined with its right-hand neighbour through
 * avg2() (defined elsewhere in this file).
 *
 * Note that each reference row is read up to index 8, i.e. 9 samples.
 *
 * @param v         unused
 * @param pix1      first row of the block being measured
 * @param pix2      first row of the reference block
 * @param line_size offset between two consecutive rows, applied to both blocks
 * @param h         number of rows to process
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
1625
/**
 * Sum of absolute differences over an 8-sample-wide block, where every
 * reference sample is first combined with the sample one line below it
 * through avg2() (defined elsewhere in this file).
 *
 * @param v         unused
 * @param pix1      first row of the block being measured
 * @param pix2      first row of the reference block
 * @param line_size offset between two consecutive rows, applied to both blocks
 * @param h         number of rows to process
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *cur   = pix1;
    uint8_t *upper = pix2;
    uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(upper[col], lower[col]));

        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
1647
/**
 * Sum of absolute differences over an 8-sample-wide block, where every
 * reference value is avg4() (defined elsewhere in this file) of a 2x2
 * neighbourhood: the sample, its right neighbour and the two samples
 * directly below them.
 *
 * Note that each reference row is read up to index 8, i.e. 9 samples.
 *
 * @param v         unused
 * @param pix1      first row of the block being measured
 * @param pix2      first row of the reference block
 * @param line_size offset between two consecutive rows, applied to both blocks
 * @param h         number of rows to process
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *cur   = pix1;
    uint8_t *upper = pix2;
    uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg4(upper[col], upper[col + 1],
                                         lower[col], lower[col + 1]));

        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
1669
1670 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1671     MpegEncContext *c = v;
1672     int score1=0;
1673     int score2=0;
1674     int x,y;
1675
1676     for(y=0; y<h; y++){
1677         for(x=0; x<16; x++){
1678             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1679         }
1680         if(y+1<h){
1681             for(x=0; x<15; x++){
1682                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1683                              - s1[x+1] + s1[x+1+stride])
1684                         -FFABS(  s2[x  ] - s2[x  +stride]
1685                              - s2[x+1] + s2[x+1+stride]);
1686             }
1687         }
1688         s1+= stride;
1689         s2+= stride;
1690     }
1691
1692     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1693     else  return score1 + FFABS(score2)*8;
1694 }
1695
1696 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1697     MpegEncContext *c = v;
1698     int score1=0;
1699     int score2=0;
1700     int x,y;
1701
1702     for(y=0; y<h; y++){
1703         for(x=0; x<8; x++){
1704             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1705         }
1706         if(y+1<h){
1707             for(x=0; x<7; x++){
1708                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1709                              - s1[x+1] + s1[x+1+stride])
1710                         -FFABS(  s2[x  ] - s2[x  +stride]
1711                              - s2[x+1] + s2[x+1+stride]);
1712             }
1713         }
1714         s1+= stride;
1715         s2+= stride;
1716     }
1717
1718     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1719     else  return score1 + FFABS(score2)*8;
1720 }
1721
1722 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1723     int i;
1724     unsigned int sum=0;
1725
1726     for(i=0; i<8*8; i++){
1727         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1728         int w= weight[i];
1729         b>>= RECON_SHIFT;
1730         assert(-512<b && b<512);
1731
1732         sum += (w*b)*(w*b)>>4;
1733     }
1734     return sum>>2;
1735 }
1736
1737 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1738     int i;
1739
1740     for(i=0; i<8*8; i++){
1741         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1742     }
1743 }
1744
1745 /**
1746  * permutes an 8x8 block.
1747  * @param block the block which will be permuted according to the given permutation vector
1748  * @param permutation the permutation vector
1749  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1750  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1751  *                  (inverse) permutated to scantable order!
1752  */
1753 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1754 {
1755     int i;
1756     DCTELEM temp[64];
1757
1758     if(last<=0) return;
1759     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1760
1761     for(i=0; i<=last; i++){
1762         const int j= scantable[i];
1763         temp[j]= block[j];
1764         block[j]=0;
1765     }
1766
1767     for(i=0; i<=last; i++){
1768         const int j= scantable[i];
1769         const int perm_j= permutation[j];
1770         block[perm_j]= temp[j];
1771     }
1772 }
1773
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of zero.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1777
1778 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1779     int i;
1780
1781     memset(cmp, 0, sizeof(void*)*6);
1782
1783     for(i=0; i<6; i++){
1784         switch(type&0xFF){
1785         case FF_CMP_SAD:
1786             cmp[i]= c->sad[i];
1787             break;
1788         case FF_CMP_SATD:
1789             cmp[i]= c->hadamard8_diff[i];
1790             break;
1791         case FF_CMP_SSE:
1792             cmp[i]= c->sse[i];
1793             break;
1794         case FF_CMP_DCT:
1795             cmp[i]= c->dct_sad[i];
1796             break;
1797         case FF_CMP_DCT264:
1798             cmp[i]= c->dct264_sad[i];
1799             break;
1800         case FF_CMP_DCTMAX:
1801             cmp[i]= c->dct_max[i];
1802             break;
1803         case FF_CMP_PSNR:
1804             cmp[i]= c->quant_psnr[i];
1805             break;
1806         case FF_CMP_BIT:
1807             cmp[i]= c->bit[i];
1808             break;
1809         case FF_CMP_RD:
1810             cmp[i]= c->rd[i];
1811             break;
1812         case FF_CMP_VSAD:
1813             cmp[i]= c->vsad[i];
1814             break;
1815         case FF_CMP_VSSE:
1816             cmp[i]= c->vsse[i];
1817             break;
1818         case FF_CMP_ZERO:
1819             cmp[i]= zero_cmp;
1820             break;
1821         case FF_CMP_NSSE:
1822             cmp[i]= c->nsse[i];
1823             break;
1824 #if CONFIG_DWT
1825         case FF_CMP_W53:
1826             cmp[i]= c->w53[i];
1827             break;
1828         case FF_CMP_W97:
1829             cmp[i]= c->w97[i];
1830             break;
1831 #endif
1832         default:
1833             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1834         }
1835     }
1836 }
1837
1838 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1839     long i;
1840     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1841         long a = *(long*)(src+i);
1842         long b = *(long*)(dst+i);
1843         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1844     }
1845     for(; i<w; i++)
1846         dst[i+0] += src[i+0];
1847 }
1848
1849 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1850     long i;
1851 #if !HAVE_FAST_UNALIGNED
1852     if((long)src2 & (sizeof(long)-1)){
1853         for(i=0; i+7<w; i+=8){
1854             dst[i+0] = src1[i+0]-src2[i+0];
1855             dst[i+1] = src1[i+1]-src2[i+1];
1856             dst[i+2] = src1[i+2]-src2[i+2];
1857             dst[i+3] = src1[i+3]-src2[i+3];
1858             dst[i+4] = src1[i+4]-src2[i+4];
1859             dst[i+5] = src1[i+5]-src2[i+5];
1860             dst[i+6] = src1[i+6]-src2[i+6];
1861             dst[i+7] = src1[i+7]-src2[i+7];
1862         }
1863     }else
1864 #endif
1865     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1866         long a = *(long*)(src1+i);
1867         long b = *(long*)(src2+i);
1868         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1869     }
1870     for(; i<w; i++)
1871         dst[i+0] = src1[i+0]-src2[i+0];
1872 }
1873
/**
 * Reconstruct a row from residuals using a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous output
 * sample, the sample from src1 at the same position, and
 * (previous + src1 sample - previous src1 sample) reduced to 8 bits.
 * The residual diff[i] is added to it and the result, truncated to 8 bits,
 * is stored in dst[i].
 *
 * @param dst      reconstructed row
 * @param src1     reference row supplying the second predictor input
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: value used as the previous output sample for i = 0;
 *                 out: last reconstructed sample
 * @param left_top in: value used as the previous src1 sample for i = 0;
 *                 out: last sample read from src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient = (prev + src1[x] - prev_top) & 0xFF;

        prev     = mid_pred(prev, src1[x], gradient) + diff[x];
        prev_top = src1[x];
        dst[x]   = prev;
    }

    *left     = prev;
    *left_top = prev_top;
}
1890
/**
 * Produce residuals of a row against a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous src2
 * sample, the src1 sample at the same position, and
 * (previous + src1 sample - previous src1 sample) reduced to 8 bits.
 * dst[i] receives src2[i] minus that predictor, truncated to 8 bits.
 *
 * @param dst      residual output
 * @param src1     reference row supplying the second predictor input
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: value used as the previous src2 sample for i = 0;
 *                 out: last sample read from src2
 * @param left_top in: value used as the previous src1 sample for i = 0;
 *                 out: last sample read from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient  = (prev + src1[x] - prev_top) & 0xFF;
        const int predicted = mid_pred(prev, src1[x], gradient);

        prev_top = src1[x];
        prev     = src2[x];
        dst[x]   = prev - predicted;
    }

    *left     = prev;
    *left_top = prev_top;
}
1908
/**
 * Running-sum reconstruction of a row: every output byte is the
 * accumulator after adding the corresponding input byte, truncated to
 * 8 bits on store. The accumulator itself is a plain int and is not
 * truncated.
 *
 * @param dst destination row
 * @param src residual row
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the accumulator after the last byte (acc unchanged when w <= 0)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1927
/* Byte position of each channel inside a 4-byte pixel; the order is
 * mirrored when HAVE_BIGENDIAN is set. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum reconstruction of a row of 4-byte pixels, with one
 * independent accumulator per channel. Every stored byte is the channel's
 * accumulator truncated to 8 bits; the accumulators themselves are ints.
 *
 * @param dst   destination row, 4*w bytes
 * @param src   residual row, 4*w bytes
 * @param w     number of pixels
 * @param red   in: initial red accumulator, out: its final value
 * @param green in: initial green accumulator, out: its final value
 * @param blue  in: initial blue accumulator, out: its final value
 * @param alpha in: initial alpha accumulator, out: its final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* read the whole pixel before writing it, as dst may be src */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1968
/* Out-of-place butterfly: o1 receives i1+i2 and o2 receives i1-i2.
 * Expands to two statements and evaluates both inputs twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y, both computed from
 * the values the operands had on entry. */
#define BUTTERFLY1(x,y) \
{\
    int bfly_p, bfly_q;\
    bfly_p= x;\
    bfly_q= y;\
    x= bfly_p+bfly_q;\
    y= bfly_p-bfly_q;\
}

/* Final butterfly stage folded into a magnitude: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1983
/**
 * Sum of the absolute values of the 64 coefficients obtained by applying
 * three butterfly stages along the rows and three along the columns to the
 * 8x8 difference src - dst.
 *
 * @param s      unused
 * @param dst    first row of the block that is subtracted
 * @param src    first row of the block it is subtracted from
 * @param stride offset between two consecutive rows, applied to both blocks
 * @param h      must be 8 (asserted)
 * @return the sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int row, col, span, k;

    assert(h==8);

    /* row pass: load the differences, then butterflies at distance 1, 2, 4 */
    for (row = 0; row < 8; row++) {
        int *line = coef + 8 * row;

        for (col = 0; col < 8; col++)
            line[col] = src[stride * row + col] - dst[stride * row + col];

        for (span = 1; span < 8; span <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & span)) {
                    const int p = line[k];
                    const int q = line[k + span];

                    line[k]        = p + q;
                    line[k + span] = p - q;
                }
            }
        }
    }

    /* column pass: distances 1 and 2 in place; the distance-4 stage is
     * folded directly into the magnitude sum and never stored */
    for (col = 0; col < 8; col++) {
        for (span = 1; span < 4; span <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & span)) {
                    const int p = coef[8 * k + col];
                    const int q = coef[8 * (k + span) + col];

                    coef[8 * k + col]          = p + q;
                    coef[8 * (k + span) + col] = p - q;
                }
            }
        }

        for (k = 0; k < 4; k++) {
            const int p = coef[8 * k + col];
            const int q = coef[8 * (k + 4) + col];

            total += abs(p + q) + abs(p - q);
        }
    }
    return total;
}
2028
/**
 * Sum of the absolute values of the coefficients obtained by applying three
 * butterfly stages along the rows and three along the columns to an 8x8
 * block of samples, with the magnitude of the first coefficient (the sum of
 * all samples) subtracted again at the end.
 *
 * @param s      unused
 * @param src    first row of the block
 * @param dummy  unused
 * @param stride offset between two consecutive rows
 * @param h      must be 8 (asserted)
 * @return the sum of absolute transformed values, excluding the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int row, col, span, k;

    assert(h==8);

    /* row pass: load the samples, then butterflies at distance 1, 2, 4 */
    for (row = 0; row < 8; row++) {
        int *line = coef + 8 * row;

        for (col = 0; col < 8; col++)
            line[col] = src[stride * row + col];

        for (span = 1; span < 8; span <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & span)) {
                    const int p = line[k];
                    const int q = line[k + span];

                    line[k]        = p + q;
                    line[k + span] = p - q;
                }
            }
        }
    }

    /* column pass: distances 1 and 2 in place; the distance-4 stage is
     * folded directly into the magnitude sum and never stored */
    for (col = 0; col < 8; col++) {
        for (span = 1; span < 4; span <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & span)) {
                    const int p = coef[8 * k + col];
                    const int q = coef[8 * (k + span) + col];

                    coef[8 * k + col]          = p + q;
                    coef[8 * (k + span) + col] = p - q;
                }
            }
        }

        for (k = 0; k < 4; k++) {
            const int p = coef[8 * k + col];
            const int q = coef[8 * (k + 4) + col];

            total += abs(p + q) + abs(p - q);
        }
    }

    /* coef[0] and coef[32] still lack the last stage, so their sum is the
     * first output coefficient */
    total -= abs(coef[8 * 0] + coef[8 * 4]); // -mean

    return total;
}
2076
2077 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2078     MpegEncContext * const s= (MpegEncContext *)c;
2079     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2080
2081     assert(h==8);
2082
2083     s->dsp.diff_pixels(temp, src1, src2, stride);
2084     s->dsp.fdct(temp);
2085     return s->dsp.sum_abs_dctelem(temp);
2086 }
2087
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built from additions, subtractions and
 * shifts only. The caller defines SRC(n) to fetch input n and DST(n, value)
 * to consume output n. All inputs are read into locals before the first
 * DST, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block pair with the DCT8_1D integer transform: the pixel
 * difference is transformed along one dimension in place, then along the
 * other, and the absolute values of the final outputs are summed without
 * being stored.
 *
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride handed to diff_pixels
 * @param h      unused
 * @return the sum of absolute transform outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const ctx = (MpegEncContext *)c;
    DCTELEM blk[8][8];
    int total = 0;
    int i;

    ctx->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* first pass: transform blk[i][0..7] in place for every i */
#define SRC(x) blk[i][x]
#define DST(x,v) blk[i][x]= v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform blk[0..7][i], accumulating magnitudes only */
#define SRC(x) blk[x][i]
#define DST(x,v) total += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return total;
}
#endif
2140
2141 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2142     MpegEncContext * const s= (MpegEncContext *)c;
2143     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2144     int sum=0, i;
2145
2146     assert(h==8);
2147
2148     s->dsp.diff_pixels(temp, src1, src2, stride);
2149     s->dsp.fdct(temp);
2150
2151     for(i=0; i<64; i++)
2152         sum= FFMAX(sum, FFABS(temp[i]));
2153
2154     return sum;
2155 }
2156
2157 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2158     MpegEncContext * const s= (MpegEncContext *)c;
2159     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2160     DCTELEM * const bak = temp+64;
2161     int sum=0, i;
2162
2163     assert(h==8);
2164     s->mb_intra=0;
2165
2166     s->dsp.diff_pixels(temp, src1, src2, stride);
2167
2168     memcpy(bak, temp, 64*sizeof(DCTELEM));
2169
2170     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2171     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2172     ff_simple_idct_8(temp); //FIXME
2173
2174     for(i=0; i<64; i++)
2175         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2176
2177     return sum;
2178 }
2179
2180 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2181     MpegEncContext * const s= (MpegEncContext *)c;
2182     const uint8_t *scantable= s->intra_scantable.permutated;
2183     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2184     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2185     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2186     int i, last, run, bits, level, distortion, start_i;
2187     const int esc_length= s->ac_esc_length;
2188     uint8_t * length;
2189     uint8_t * last_length;
2190
2191     assert(h==8);
2192
2193     copy_block8(lsrc1, src1, 8, stride, 8);
2194     copy_block8(lsrc2, src2, 8, stride, 8);
2195
2196     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2197
2198     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2199
2200     bits=0;
2201
2202     if (s->mb_intra) {
2203         start_i = 1;
2204         length     = s->intra_ac_vlc_length;
2205         last_length= s->intra_ac_vlc_last_length;
2206         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2207     } else {
2208         start_i = 0;
2209         length     = s->inter_ac_vlc_length;
2210         last_length= s->inter_ac_vlc_last_length;
2211     }
2212
2213     if(last>=start_i){
2214         run=0;
2215         for(i=start_i; i<last; i++){
2216             int j= scantable[i];
2217             level= temp[j];
2218
2219             if(level){
2220                 level+=64;
2221                 if((level&(~127)) == 0){
2222                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2223                 }else
2224                     bits+= esc_length;
2225                 run=0;
2226             }else
2227                 run++;
2228         }
2229         i= scantable[last];
2230
2231         level= temp[i] + 64;
2232
2233         assert(level - 64);
2234
2235         if((level&(~127)) == 0){
2236             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2237         }else
2238             bits+= esc_length;
2239
2240     }
2241
2242     if(last>=0){
2243         if(s->mb_intra)
2244             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2245         else
2246             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2247     }
2248
2249     s->dsp.idct_add(lsrc2, 8, temp);
2250
2251     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2252
2253     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2254 }
2255
2256 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2257     MpegEncContext * const s= (MpegEncContext *)c;
2258     const uint8_t *scantable= s->intra_scantable.permutated;
2259     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2260     int i, last, run, bits, level, start_i;
2261     const int esc_length= s->ac_esc_length;
2262     uint8_t * length;
2263     uint8_t * last_length;
2264
2265     assert(h==8);
2266
2267     s->dsp.diff_pixels(temp, src1, src2, stride);
2268
2269     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2270
2271     bits=0;
2272
2273     if (s->mb_intra) {
2274         start_i = 1;
2275         length     = s->intra_ac_vlc_length;
2276         last_length= s->intra_ac_vlc_last_length;
2277         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2278     } else {
2279         start_i = 0;
2280         length     = s->inter_ac_vlc_length;
2281         last_length= s->inter_ac_vlc_last_length;
2282     }
2283
2284     if(last>=start_i){
2285         run=0;
2286         for(i=start_i; i<last; i++){
2287             int j= scantable[i];
2288             level= temp[j];
2289
2290             if(level){
2291                 level+=64;
2292                 if((level&(~127)) == 0){
2293                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2294                 }else
2295                     bits+= esc_length;
2296                 run=0;
2297             }else
2298                 run++;
2299         }
2300         i= scantable[last];
2301
2302         level= temp[i] + 64;
2303
2304         assert(level - 64);
2305
2306         if((level&(~127)) == 0){
2307             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2308         }else
2309             bits+= esc_length;
2310     }
2311
2312     return bits;
2313 }
2314
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every sample of a <size>-wide block and the sample directly below it,
 * taken over rows 0..h-2. Columns are consumed in groups of four.
 * The context and the second block pointer are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                          \
    int row, col, k;                                                        \
                                                                            \
    for (row = 1; row < h; row++) {                                         \
        for (col = 0; col < size; col += 4) {                               \
            for (k = 0; k < 4; k++)                                         \
                total += FFABS(s[col + k] - s[col + k + stride]);           \
        }                                                                   \
        s += stride;                                                        \
    }                                                                       \
                                                                            \
    return total;                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2332
/**
 * Sum of absolute vertical-gradient differences over a 16-sample-wide
 * block pair: for every sample in rows 0..h-2, the difference s1 - s2 is
 * compared with the same difference one row further down.
 *
 * @param c      unused
 * @param s1     first row of the first block
 * @param s2     first row of the second block
 * @param stride offset between two consecutive rows, applied to both blocks
 * @param h      number of rows; h-1 row pairs are evaluated
 * @return the accumulated absolute value
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];

            total += abs(here - below);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2347
/* Square of the argument; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every sample of a <size>-wide block and the sample directly below it,
 * taken over rows 0..h-2. Columns are consumed in groups of four.
 * The context and the second block pointer are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                          \
    int row, col, k;                                                        \
                                                                            \
    for (row = 1; row < h; row++) {                                         \
        for (col = 0; col < size; col += 4) {                               \
            for (k = 0; k < 4; k++)                                         \
                total += SQ(s[col + k] - s[col + k + stride]);              \
        }                                                                   \
        s += stride;                                                        \
    }                                                                       \
                                                                            \
    return total;                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2366
/**
 * Sum of squared vertical-gradient differences over a 16-sample-wide block
 * pair: for every sample in rows 0..h-2, the difference s1 - s2 is compared
 * with the same difference one row further down, and the result is squared.
 *
 * @param c      unused
 * @param s1     first row of the first block
 * @param s2     first row of the second block
 * @param stride offset between two consecutive rows, applied to both blocks
 * @param h      number of rows; h-1 row pairs are evaluated
 * @return the accumulated squared value
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2381
/**
 * Sum of squared differences between an int8_t vector and an int16_t
 * vector of the same length.
 *
 * @param pix1 first vector
 * @param pix2 second vector
 * @param size number of elements
 * @return the accumulated squared difference (0 when size <= 0)
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }
    return total;
}
2390
2391 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2392 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2393 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2394 #if CONFIG_GPL
2395 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2396 #endif
2397 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2398 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2399 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2400 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2401
/**
 * Element-wise product of two float vectors.
 *
 * @param dst  output vector, dst[i] = src0[i] * src1[i]
 * @param src0 first input vector
 * @param src1 second input vector
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const float a = src0[k];
        const float b = src1[k];

        dst[k] = a * b;
    }
}
2407
/**
 * Element-wise product of a float vector with a second vector that is
 * traversed back to front.
 *
 * @param dst  output vector, dst[i] = src0[i] * src1[len-1-i]
 * @param src0 first input vector, read in forward order
 * @param src1 second input vector, read in reverse order
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
2414
/**
 * Element-wise multiply-add of float vectors.
 *
 * @param dst  output vector, dst[i] = src0[i] * src1[i] + src2[i]
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2420
/**
 * Windowed combination of two float vectors into an output of twice the
 * length.
 *
 * For k in [0, len) and its mirror m = 2*len-1-k:
 *   dst[k] = src0[k] * win[m] - src1[len-1-k] * win[k]
 *   dst[m] = src0[k] * win[k] + src1[len-1-k] * win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements, read front to back
 * @param src1 second input, len elements, read back to front
 * @param win  window, 2*len elements
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int   m  = 2 * len - 1 - k;   /* mirrored position */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[k];
        const float wj = win[m];

        dst[k] = s0*wj - s1*wi;
        dst[m] = s0*wi + s1*wj;
    }
}
2437
/**
 * Multiply every element of a float vector by a constant.
 *
 * @param dst output vector, dst[i] = src[i] * mul
 * @param src input vector
 * @param mul the constant factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
2445
/**
 * In-place butterfly on two float vectors: afterwards v1 holds the
 * element-wise sum and v2 the element-wise difference (old v1 - old v2).
 *
 * @param v1  first vector, replaced by the sums
 * @param v2  second vector, replaced by the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
2456
/**
 * Dot product of two float vectors, accumulated front to back in single
 * precision.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return the sum of v1[i] * v2[i] (0 when len <= 0)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2467
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * The caller in this file passes the raw 32-bit patterns of floats, with
 * maxisign being maxi with bit 31 flipped.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when a with bit 31 flipped exceeds maxisign
 * @param maxisign comparison threshold for the upper bound
 * @return mini, maxi, or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2476
2477 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2478     int i;
2479     uint32_t mini = *(uint32_t*)min;
2480     uint32_t maxi = *(uint32_t*)max;
2481     uint32_t maxisign = maxi ^ (1U<<31);
2482     uint32_t *dsti = (uint32_t*)dst;
2483     const uint32_t *srci = (const uint32_t*)src;
2484     for(i=0; i<len; i+=8) {
2485         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2486         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2487         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2488         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2489         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2490         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2491         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2492         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2493     }
2494 }
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element.  Elements are handled in groups of 8, so len is effectively
 * rounded up to a multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2512
/**
 * Dot product of two int16_t vectors with a per-term right shift.
 *
 * @param order number of elements to process
 * @param shift right shift applied to each product before it is summed
 * @return the sum of (v1[k] * v2[k]) >> shift over all processed elements
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
2522
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For each of the order elements, the product v1[k] * v2[k] is accumulated
 * using the value of v1[k] from before the update, then mul * v3[k] is
 * added to v1[k].
 *
 * @return the accumulated dot product
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2532
/**
 * Apply a symmetric int16_t window to a block of samples.
 *
 * Only the first len/2 window coefficients are read; coefficient k is used
 * for both sample k and sample len-1-k.  Each product is rounded by adding
 * 1 << 14 and shifted right by 15.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int head;
    int half = len >> 1;

    for (head = 0; head < half; head++) {
        const unsigned int tail = len - head - 1;
        const int16_t coef      = window[head];

        output[head] = (MUL16(input[head], coef) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], coef) + (1 << 14)) >> 15;
    }
}
2545
/**
 * Clip every element of an int32_t vector to [min, max].
 *
 * Writes av_clip(src[k], min, max) to dst[k] for k in [0, len).
 *
 * The loop is bounded by len itself.  The former 8-way unrolled do/while
 * decremented the unsigned len by 8 per pass, so it always processed 8
 * elements even for len == 0 and wrapped around whenever len was 0 or not
 * a multiple of 8, running far past the end of both buffers.  For any len
 * that is a non-zero multiple of 8 the result is unchanged.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int k;

    for (k = 0; k < len; k++)
        dst[k] = av_clip(src[k], min, max);
}
2561
/* Fixed-point cosine table: Wk = 2048 * sqrt(2) * cos(k * pi / 16) */
#define W0 2048
#define W1 2841 /* k = 1 */
#define W2 2676 /* k = 2 */
#define W3 2408 /* k = 3 */
#define W4 2048 /* k = 4 */
#define W5 1609 /* k = 5 */
#define W6 1108 /* k = 6 */
#define W7 565  /* k = 7 */

/**
 * One horizontal pass of the WMV2 inverse DCT over 8 consecutive
 * coefficients, done in place.
 */
static void wmv2_idct_row(short * b)
{
    const int x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
    const int x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
    const int rnd = 1 << 7;

    /* step 1: rotations */
    const int a1 = W1*x1 + W7*x7;
    const int a7 = W7*x1 - W1*x7;
    const int a5 = W5*x5 + W3*x3;
    const int a3 = W3*x5 - W5*x3;
    const int a2 = W2*x2 + W6*x6;
    const int a6 = W6*x2 - W2*x6;
    const int a0 = W0*x0 + W0*x4;
    const int a4 = W0*x0 - W0*x4;

    /* step 2: 181/256 scaling of the odd-part cross terms */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* step 3: final butterflies, rounded and scaled down by 8 bits */
    const int e0 = a0 + a2;
    const int e1 = a4 + a6;
    const int e2 = a4 - a6;
    const int e3 = a0 - a2;
    const int o0 = a1 + a5;
    const int o3 = a7 + a3;

    b[0] = (e0 + o0 + rnd) >> 8;
    b[1] = (e1 + s1 + rnd) >> 8;
    b[2] = (e2 + s2 + rnd) >> 8;
    b[3] = (e3 + o3 + rnd) >> 8;
    b[4] = (e3 - o3 + rnd) >> 8;
    b[5] = (e2 - s2 + rnd) >> 8;
    b[6] = (e1 - s1 + rnd) >> 8;
    b[7] = (e0 - o0 + rnd) >> 8;
}

/**
 * One vertical pass of the WMV2 inverse DCT over the 8 coefficients of a
 * column (stride 8), done in place.
 */
static void wmv2_idct_col(short * b)
{
    const int x0 = b[8*0], x1 = b[8*1], x2 = b[8*2], x3 = b[8*3];
    const int x4 = b[8*4], x5 = b[8*5], x6 = b[8*6], x7 = b[8*7];
    const int rnd = 1 << 13;

    /* step 1, with extended precision: products are pre-scaled by 3 bits */
    const int a1 = (W1*x1 + W7*x7 + 4)>>3;
    const int a7 = (W7*x1 - W1*x7 + 4)>>3;
    const int a5 = (W5*x5 + W3*x3 + 4)>>3;
    const int a3 = (W3*x5 - W5*x3 + 4)>>3;
    const int a2 = (W2*x2 + W6*x6 + 4)>>3;
    const int a6 = (W6*x2 - W2*x6 + 4)>>3;
    const int a0 = (W0*x0 + W0*x4    )>>3;
    const int a4 = (W0*x0 - W0*x4    )>>3;

    /* step 2 */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* step 3: final butterflies, rounded and scaled down by 14 bits */
    const int e0 = a0 + a2;
    const int e1 = a4 + a6;
    const int e2 = a4 - a6;
    const int e3 = a0 - a2;
    const int o0 = a1 + a5;
    const int o3 = a7 + a3;

    b[8*0] = (e0 + o0 + rnd) >> 14;
    b[8*1] = (e1 + s1 + rnd) >> 14;
    b[8*2] = (e2 + s2 + rnd) >> 14;
    b[8*3] = (e3 + o3 + rnd) >> 14;

    b[8*4] = (e3 - o3 + rnd) >> 14;
    b[8*5] = (e2 - s2 + rnd) >> 14;
    b[8*6] = (e1 - s1 + rnd) >> 14;
    b[8*7] = (e0 - o0 + rnd) >> 14;
}

/**
 * In-place WMV2 inverse DCT of a 64-coefficient block: all eight rows are
 * transformed first, then all eight columns.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
/* XXX: these wrapper functions should be removed as soon as all IDCTs
 have been converted */
2636 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2637 {
2638     ff_wmv2_idct_c(block);
2639     ff_put_pixels_clamped_c(block, dest, line_size);
2640 }
2641 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2642 {
2643     ff_wmv2_idct_c(block);
2644     ff_add_pixels_clamped_c(block, dest, line_size);
2645 }
2646 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2647 {
2648     j_rev_dct (block);
2649     ff_put_pixels_clamped_c(block, dest, line_size);
2650 }
2651 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2652 {
2653     j_rev_dct (block);
2654     ff_add_pixels_clamped_c(block, dest, line_size);
2655 }
2656
2657 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2658 {
2659     j_rev_dct4 (block);
2660     put_pixels_clamped4_c(block, dest, line_size);
2661 }
2662 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2663 {
2664     j_rev_dct4 (block);
2665     add_pixels_clamped4_c(block, dest, line_size);
2666 }
2667
2668 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2669 {
2670     j_rev_dct2 (block);
2671     put_pixels_clamped2_c(block, dest, line_size);
2672 }
2673 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2674 {
2675     j_rev_dct2 (block);
2676     add_pixels_clamped2_c(block, dest, line_size);
2677 }
2678
2679 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2680 {
2681     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2682
2683     dest[0] = cm[(block[0] + 4)>>3];
2684 }
2685 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2686 {
2687     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2688
2689     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2690 }
2691
2692 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2693
2694 /* init static data */
2695 av_cold void dsputil_static_init(void)
2696 {
2697     int i;
2698
2699     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2700     for(i=0;i<MAX_NEG_CROP;i++) {
2701         ff_cropTbl[i] = 0;
2702         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2703     }
2704
2705     for(i=0;i<512;i++) {
2706         ff_squareTbl[i] = (i - 256) * (i - 256);
2707     }
2708
2709     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2710 }
2711
2712 int ff_check_alignment(void){
2713     static int did_fail=0;
2714     LOCAL_ALIGNED_16(int, aligned, [4]);
2715
2716     if((intptr_t)aligned & 15){
2717         if(!did_fail){
2718 #if HAVE_MMX || HAVE_ALTIVEC
2719             av_log(NULL, AV_LOG_ERROR,
2720                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2721                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2722                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2723                 "Do not report crashes to FFmpeg developers.\n");
2724 #endif
2725             did_fail=1;
2726         }
2727         return -1;
2728     }
2729     return 0;
2730 }
2731
2732 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2733 {
2734     int i;
2735
2736     ff_check_alignment();
2737
2738 #if CONFIG_ENCODERS
2739     if (avctx->bits_per_raw_sample == 10) {
2740         c->fdct    = ff_jpeg_fdct_islow_10;
2741         c->fdct248 = ff_fdct248_islow_10;
2742     } else {
2743         if(avctx->dct_algo==FF_DCT_FASTINT) {
2744             c->fdct    = fdct_ifast;
2745             c->fdct248 = fdct_ifast248;
2746         }
2747         else if(avctx->dct_algo==FF_DCT_FAAN) {
2748             c->fdct    = ff_faandct;
2749             c->fdct248 = ff_faandct248;
2750         }
2751         else {
2752             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2753             c->fdct248 = ff_fdct248_islow_8;
2754         }
2755     }
2756 #endif //CONFIG_ENCODERS
2757
2758     if(avctx->lowres==1){
2759         c->idct_put= ff_jref_idct4_put;
2760         c->idct_add= ff_jref_idct4_add;
2761         c->idct    = j_rev_dct4;
2762         c->idct_permutation_type= FF_NO_IDCT_PERM;
2763     }else if(avctx->lowres==2){
2764         c->idct_put= ff_jref_idct2_put;
2765         c->idct_add= ff_jref_idct2_add;
2766         c->idct    = j_rev_dct2;
2767         c->idct_permutation_type= FF_NO_IDCT_PERM;
2768     }else if(avctx->lowres==3){
2769         c->idct_put= ff_jref_idct1_put;
2770         c->idct_add= ff_jref_idct1_add;
2771         c->idct    = j_rev_dct1;
2772         c->idct_permutation_type= FF_NO_IDCT_PERM;
2773     }else{
2774         if (avctx->bits_per_raw_sample == 10) {
2775             c->idct_put              = ff_simple_idct_put_10;
2776             c->idct_add              = ff_simple_idct_add_10;
2777             c->idct                  = ff_simple_idct_10;
2778             c->idct_permutation_type = FF_NO_IDCT_PERM;
2779         } else {
2780         if(avctx->idct_algo==FF_IDCT_INT){
2781             c->idct_put= ff_jref_idct_put;
2782             c->idct_add= ff_jref_idct_add;
2783             c->idct    = j_rev_dct;
2784             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2785         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2786                 avctx->idct_algo==FF_IDCT_VP3){
2787             c->idct_put= ff_vp3_idct_put_c;
2788             c->idct_add= ff_vp3_idct_add_c;
2789             c->idct    = ff_vp3_idct_c;
2790             c->idct_permutation_type= FF_NO_IDCT_PERM;
2791         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2792             c->idct_put= ff_wmv2_idct_put_c;
2793             c->idct_add= ff_wmv2_idct_add_c;
2794             c->idct    = ff_wmv2_idct_c;
2795             c->idct_permutation_type= FF_NO_IDCT_PERM;
2796         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2797             c->idct_put= ff_faanidct_put;
2798             c->idct_add= ff_faanidct_add;
2799             c->idct    = ff_faanidct;
2800             c->idct_permutation_type= FF_NO_IDCT_PERM;
2801         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2802             c->idct_put= ff_ea_idct_put_c;
2803             c->idct_permutation_type= FF_NO_IDCT_PERM;
2804         }else{ //accurate/default
2805             c->idct_put = ff_simple_idct_put_8;
2806             c->idct_add = ff_simple_idct_add_8;
2807             c->idct     = ff_simple_idct_8;
2808             c->idct_permutation_type= FF_NO_IDCT_PERM;
2809         }
2810         }
2811     }
2812
2813     c->diff_pixels = diff_pixels_c;
2814     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2815     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2816     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2817     c->sum_abs_dctelem = sum_abs_dctelem_c;
2818     c->gmc1 = gmc1_c;
2819     c->gmc = ff_gmc_c;
2820     c->pix_sum = pix_sum_c;
2821     c->pix_norm1 = pix_norm1_c;
2822
2823     c->fill_block_tab[0] = fill_block16_c;
2824     c->fill_block_tab[1] = fill_block8_c;
2825
2826     /* TODO [0] 16  [1] 8 */
2827     c->pix_abs[0][0] = pix_abs16_c;
2828     c->pix_abs[0][1] = pix_abs16_x2_c;
2829     c->pix_abs[0][2] = pix_abs16_y2_c;
2830     c->pix_abs[0][3] = pix_abs16_xy2_c;
2831     c->pix_abs[1][0] = pix_abs8_c;
2832     c->pix_abs[1][1] = pix_abs8_x2_c;
2833     c->pix_abs[1][2] = pix_abs8_y2_c;
2834     c->pix_abs[1][3] = pix_abs8_xy2_c;
2835
2836     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2837     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2838     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2839     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2840     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2841     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2842     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2843     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2844     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2845
2846     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2847     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2848     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2849     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2850     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2851     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2852     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2853     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2854     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2855
2856 #define dspfunc(PFX, IDX, NUM) \
2857     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2858     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2859     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2860     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2861     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2862     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2863     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2864     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2865     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2866     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2867     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2868     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2869     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2870     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2871     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2872     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2873
2874     dspfunc(put_qpel, 0, 16);
2875     dspfunc(put_no_rnd_qpel, 0, 16);
2876
2877     dspfunc(avg_qpel, 0, 16);
2878     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2879
2880     dspfunc(put_qpel, 1, 8);
2881     dspfunc(put_no_rnd_qpel, 1, 8);
2882
2883     dspfunc(avg_qpel, 1, 8);
2884     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2885
2886 #undef dspfunc
2887
2888 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2889     ff_mlp_init(c, avctx);
2890 #endif
2891 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2892     ff_intrax8dsp_init(c,avctx);
2893 #endif
2894
2895     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2896     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2897     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2898     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2899     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2900     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2901     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2902     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2903
2904 #define SET_CMP_FUNC(name) \
2905     c->name[0]= name ## 16_c;\
2906     c->name[1]= name ## 8x8_c;
2907
2908     SET_CMP_FUNC(hadamard8_diff)
2909     c->hadamard8_diff[4]= hadamard8_intra16_c;
2910     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2911     SET_CMP_FUNC(dct_sad)
2912     SET_CMP_FUNC(dct_max)
2913 #if CONFIG_GPL
2914     SET_CMP_FUNC(dct264_sad)
2915 #endif
2916     c->sad[0]= pix_abs16_c;
2917     c->sad[1]= pix_abs8_c;
2918     c->sse[0]= sse16_c;
2919     c->sse[1]= sse8_c;
2920     c->sse[2]= sse4_c;
2921     SET_CMP_FUNC(quant_psnr)
2922     SET_CMP_FUNC(rd)
2923     SET_CMP_FUNC(bit)
2924     c->vsad[0]= vsad16_c;
2925     c->vsad[4]= vsad_intra16_c;
2926     c->vsad[5]= vsad_intra8_c;
2927     c->vsse[0]= vsse16_c;
2928     c->vsse[4]= vsse_intra16_c;
2929     c->vsse[5]= vsse_intra8_c;
2930     c->nsse[0]= nsse16_c;
2931     c->nsse[1]= nsse8_c;
2932 #if CONFIG_DWT
2933     ff_dsputil_init_dwt(c);
2934 #endif
2935
2936     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2937
2938     c->add_bytes= add_bytes_c;
2939     c->diff_bytes= diff_bytes_c;
2940     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2941     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2942     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2943     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2944     c->bswap_buf= bswap_buf;
2945     c->bswap16_buf = bswap16_buf;
2946
2947     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2948         c->h263_h_loop_filter= h263_h_loop_filter_c;
2949         c->h263_v_loop_filter= h263_v_loop_filter_c;
2950     }
2951
2952     if (CONFIG_VP3_DECODER) {
2953         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2954         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2955         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2956     }
2957
2958     c->h261_loop_filter= h261_loop_filter_c;
2959
2960     c->try_8x8basis= try_8x8basis_c;
2961     c->add_8x8basis= add_8x8basis_c;
2962
2963 #if CONFIG_VORBIS_DECODER
2964     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
2965 #endif
2966 #if CONFIG_AC3_DECODER
2967     c->ac3_downmix = ff_ac3_downmix_c;
2968 #endif
2969     c->vector_fmul = vector_fmul_c;
2970     c->vector_fmul_reverse = vector_fmul_reverse_c;
2971     c->vector_fmul_add = vector_fmul_add_c;
2972     c->vector_fmul_window = vector_fmul_window_c;
2973     c->vector_clipf = vector_clipf_c;
2974     c->scalarproduct_int16 = scalarproduct_int16_c;
2975     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2976     c->apply_window_int16 = apply_window_int16_c;
2977     c->vector_clip_int32 = vector_clip_int32_c;
2978     c->scalarproduct_float = scalarproduct_float_c;
2979     c->butterflies_float = butterflies_float_c;
2980     c->vector_fmul_scalar = vector_fmul_scalar_c;
2981
2982     c->shrink[0]= av_image_copy_plane;
2983     c->shrink[1]= ff_shrink22;
2984     c->shrink[2]= ff_shrink44;
2985     c->shrink[3]= ff_shrink88;
2986
2987     c->prefetch= just_return;
2988
2989     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
2990     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
2991
2992 #undef FUNC
2993 #undef FUNCC
2994 #define FUNC(f, depth) f ## _ ## depth
2995 #define FUNCC(f, depth) f ## _ ## depth ## _c
2996
2997 #define dspfunc1(PFX, IDX, NUM, depth)\
2998     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
2999     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3000     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3001     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3002
3003 #define dspfunc2(PFX, IDX, NUM, depth)\
3004     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3005     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3006     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3007     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3008     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3009     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3010     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3011     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3012     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3013     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3014     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3015     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3016     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3017     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3018     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3019     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3020
3021
3022 #define BIT_DEPTH_FUNCS(depth, dct)\
3023     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
3024     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3025     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3026     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
3027     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
3028     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
3029     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
3030     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3031     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3032 \
3033     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3034     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3035     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3036     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3037     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3038     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3039 \
3040     dspfunc1(put       , 0, 16, depth);\
3041     dspfunc1(put       , 1,  8, depth);\
3042     dspfunc1(put       , 2,  4, depth);\
3043     dspfunc1(put       , 3,  2, depth);\
3044     dspfunc1(put_no_rnd, 0, 16, depth);\
3045     dspfunc1(put_no_rnd, 1,  8, depth);\
3046     dspfunc1(avg       , 0, 16, depth);\
3047     dspfunc1(avg       , 1,  8, depth);\
3048     dspfunc1(avg       , 2,  4, depth);\
3049     dspfunc1(avg       , 3,  2, depth);\
3050     dspfunc1(avg_no_rnd, 0, 16, depth);\
3051     dspfunc1(avg_no_rnd, 1,  8, depth);\
3052 \
3053     dspfunc2(put_h264_qpel, 0, 16, depth);\
3054     dspfunc2(put_h264_qpel, 1,  8, depth);\
3055     dspfunc2(put_h264_qpel, 2,  4, depth);\
3056     dspfunc2(put_h264_qpel, 3,  2, depth);\
3057     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3058     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3059     dspfunc2(avg_h264_qpel, 2,  4, depth);
3060
3061     switch (avctx->bits_per_raw_sample) {
3062     case 9:
3063         if (c->dct_bits == 32) {
3064             BIT_DEPTH_FUNCS(9, _32);
3065         } else {
3066             BIT_DEPTH_FUNCS(9, _16);
3067         }
3068         break;
3069     case 10:
3070         if (c->dct_bits == 32) {
3071             BIT_DEPTH_FUNCS(10, _32);
3072         } else {
3073             BIT_DEPTH_FUNCS(10, _16);
3074         }
3075         break;
3076     default:
3077         av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3078     case 8:
3079         BIT_DEPTH_FUNCS(8, _16);
3080         break;
3081     }
3082
3083
3084     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3085     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3086     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3087     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3088     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3089     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3090     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3091     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3092     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3093
3094     for(i=0; i<64; i++){
3095         if(!c->put_2tap_qpel_pixels_tab[0][i])
3096             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3097         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3098             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3099     }
3100
3101     switch(c->idct_permutation_type){
3102     case FF_NO_IDCT_PERM:
3103         for(i=0; i<64; i++)
3104             c->idct_permutation[i]= i;
3105         break;
3106     case FF_LIBMPEG2_IDCT_PERM:
3107         for(i=0; i<64; i++)
3108             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3109         break;
3110     case FF_SIMPLE_IDCT_PERM:
3111         for(i=0; i<64; i++)
3112             c->idct_permutation[i]= simple_mmx_permutation[i];
3113         break;
3114     case FF_TRANSPOSE_IDCT_PERM:
3115         for(i=0; i<64; i++)
3116             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3117         break;
3118     case FF_PARTTRANS_IDCT_PERM:
3119         for(i=0; i<64; i++)
3120             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3121         break;
3122     case FF_SSE2_IDCT_PERM:
3123         for(i=0; i<64; i++)
3124             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3125         break;
3126     default:
3127         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3128     }
3129 }