git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/attributes.h"
  31 #include "libavutil/imgutils.h"
  32 #include "libavutil/internal.h"
  33 #include "avcodec.h"
  34 #include "copy_block.h"
  35 #include "dct.h"
  36 #include "dsputil.h"
  37 #include "simple_idct.h"
  38 #include "faandct.h"
  39 #include "faanidct.h"
  40 #include "imgconvert.h"
  41 #include "mathops.h"
  42 #include "mpegvideo.h"
  43 #include "config.h"
  44
  45 uint32_t ff_squareTbl[512] = {0, };
  46
  47 #define BIT_DEPTH 16
  48 #include "dsputil_template.c"
  49 #undef BIT_DEPTH
  50
  51 #define BIT_DEPTH 8
  52 #include "dsputil_template.c"
  53
  54 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  55 #define pb_7f (~0UL/255 * 0x7f)
  56 #define pb_80 (~0UL/255 * 0x80)
  57
  58 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  59    specification, we interleave the fields */
  60 const uint8_t ff_zigzag248_direct[64] = {
  61      0,  8,  1,  9, 16, 24,  2, 10,
  62     17, 25, 32, 40, 48, 56, 33, 41,
  63     18, 26,  3, 11,  4, 12, 19, 27,
  64     34, 42, 49, 57, 50, 58, 35, 43,
  65     20, 28,  5, 13,  6, 14, 21, 29,
  66     36, 44, 51, 59, 52, 60, 37, 45,
  67     22, 30,  7, 15, 23, 31, 38, 46,
  68     53, 61, 54, 62, 39, 47, 55, 63,
  69 };
  70
  71 const uint8_t ff_alternate_horizontal_scan[64] = {
  72     0,  1,   2,  3,  8,  9, 16, 17,
  73     10, 11,  4,  5,  6,  7, 15, 14,
  74     13, 12, 19, 18, 24, 25, 32, 33,
  75     26, 27, 20, 21, 22, 23, 28, 29,
  76     30, 31, 34, 35, 40, 41, 48, 49,
  77     42, 43, 36, 37, 38, 39, 44, 45,
  78     46, 47, 50, 51, 56, 57, 58, 59,
  79     52, 53, 54, 55, 60, 61, 62, 63,
  80 };
  81
  82 const uint8_t ff_alternate_vertical_scan[64] = {
  83     0,  8,  16, 24,  1,  9,  2, 10,
  84     17, 25, 32, 40, 48, 56, 57, 49,
  85     41, 33, 26, 18,  3, 11,  4, 12,
  86     19, 27, 34, 42, 50, 58, 35, 43,
  87     51, 59, 20, 28,  5, 13,  6, 14,
  88     21, 29, 36, 44, 52, 60, 37, 45,
  89     53, 61, 22, 30,  7, 15, 23, 31,
  90     38, 46, 54, 62, 39, 47, 55, 63,
  91 };
  92
  93 /* Input permutation for the simple_idct_mmx */
  94 static const uint8_t simple_mmx_permutation[64]={
  95         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  96         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
  97         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
  98         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
  99         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 100         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 101         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 102         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 103 };
 104
 105 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 106
 107 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
 108                                const uint8_t *src_scantable)
 109 {
 110     int i;
 111     int end;
 112
 113     st->scantable= src_scantable;
 114
 115     for(i=0; i<64; i++){
 116         int j;
 117         j = src_scantable[i];
 118         st->permutated[i] = permutation[j];
 119     }
 120
 121     end=-1;
 122     for(i=0; i<64; i++){
 123         int j;
 124         j = st->permutated[i];
 125         if(j>end) end=j;
 126         st->raster_end[i]= end;
 127     }
 128 }
 129
 130 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
 131                                            int idct_permutation_type)
 132 {
 133     int i;
 134
 135     switch(idct_permutation_type){
 136     case FF_NO_IDCT_PERM:
 137         for(i=0; i<64; i++)
 138             idct_permutation[i]= i;
 139         break;
 140     case FF_LIBMPEG2_IDCT_PERM:
 141         for(i=0; i<64; i++)
 142             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 143         break;
 144     case FF_SIMPLE_IDCT_PERM:
 145         for(i=0; i<64; i++)
 146             idct_permutation[i]= simple_mmx_permutation[i];
 147         break;
 148     case FF_TRANSPOSE_IDCT_PERM:
 149         for(i=0; i<64; i++)
 150             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 151         break;
 152     case FF_PARTTRANS_IDCT_PERM:
 153         for(i=0; i<64; i++)
 154             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 155         break;
 156     case FF_SSE2_IDCT_PERM:
 157         for(i=0; i<64; i++)
 158             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 159         break;
 160     default:
 161         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 162     }
 163 }
 164
 165 static int pix_sum_c(uint8_t * pix, int line_size)
 166 {
 167     int s, i, j;
 168
 169     s = 0;
 170     for (i = 0; i < 16; i++) {
 171         for (j = 0; j < 16; j += 8) {
 172             s += pix[0];
 173             s += pix[1];
 174             s += pix[2];
 175             s += pix[3];
 176             s += pix[4];
 177             s += pix[5];
 178             s += pix[6];
 179             s += pix[7];
 180             pix += 8;
 181         }
 182         pix += line_size - 16;
 183     }
 184     return s;
 185 }
 186
 187 static int pix_norm1_c(uint8_t * pix, int line_size)
 188 {
 189     int s, i, j;
 190     uint32_t *sq = ff_squareTbl + 256;
 191
 192     s = 0;
 193     for (i = 0; i < 16; i++) {
 194         for (j = 0; j < 16; j += 8) {
 195 #if 0
 196             s += sq[pix[0]];
 197             s += sq[pix[1]];
 198             s += sq[pix[2]];
 199             s += sq[pix[3]];
 200             s += sq[pix[4]];
 201             s += sq[pix[5]];
 202             s += sq[pix[6]];
 203             s += sq[pix[7]];
 204 #else
 205 #if HAVE_FAST_64BIT
 206             register uint64_t x=*(uint64_t*)pix;
 207             s += sq[x&0xff];
 208             s += sq[(x>>8)&0xff];
 209             s += sq[(x>>16)&0xff];
 210             s += sq[(x>>24)&0xff];
 211             s += sq[(x>>32)&0xff];
 212             s += sq[(x>>40)&0xff];
 213             s += sq[(x>>48)&0xff];
 214             s += sq[(x>>56)&0xff];
 215 #else
 216             register uint32_t x=*(uint32_t*)pix;
 217             s += sq[x&0xff];
 218             s += sq[(x>>8)&0xff];
 219             s += sq[(x>>16)&0xff];
 220             s += sq[(x>>24)&0xff];
 221             x=*(uint32_t*)(pix+4);
 222             s += sq[x&0xff];
 223             s += sq[(x>>8)&0xff];
 224             s += sq[(x>>16)&0xff];
 225             s += sq[(x>>24)&0xff];
 226 #endif
 227 #endif
 228             pix += 8;
 229         }
 230         pix += line_size - 16;
 231     }
 232     return s;
 233 }
 234
 235 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 236     int i;
 237
 238     for(i=0; i+8<=w; i+=8){
 239         dst[i+0]= av_bswap32(src[i+0]);
 240         dst[i+1]= av_bswap32(src[i+1]);
 241         dst[i+2]= av_bswap32(src[i+2]);
 242         dst[i+3]= av_bswap32(src[i+3]);
 243         dst[i+4]= av_bswap32(src[i+4]);
 244         dst[i+5]= av_bswap32(src[i+5]);
 245         dst[i+6]= av_bswap32(src[i+6]);
 246         dst[i+7]= av_bswap32(src[i+7]);
 247     }
 248     for(;i<w; i++){
 249         dst[i+0]= av_bswap32(src[i+0]);
 250     }
 251 }
 252
 253 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 254 {
 255     while (len--)
 256         *dst++ = av_bswap16(*src++);
 257 }
 258
 259 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 260 {
 261     int s, i;
 262     uint32_t *sq = ff_squareTbl + 256;
 263
 264     s = 0;
 265     for (i = 0; i < h; i++) {
 266         s += sq[pix1[0] - pix2[0]];
 267         s += sq[pix1[1] - pix2[1]];
 268         s += sq[pix1[2] - pix2[2]];
 269         s += sq[pix1[3] - pix2[3]];
 270         pix1 += line_size;
 271         pix2 += line_size;
 272     }
 273     return s;
 274 }
 275
 276 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 277 {
 278     int s, i;
 279     uint32_t *sq = ff_squareTbl + 256;
 280
 281     s = 0;
 282     for (i = 0; i < h; i++) {
 283         s += sq[pix1[0] - pix2[0]];
 284         s += sq[pix1[1] - pix2[1]];
 285         s += sq[pix1[2] - pix2[2]];
 286         s += sq[pix1[3] - pix2[3]];
 287         s += sq[pix1[4] - pix2[4]];
 288         s += sq[pix1[5] - pix2[5]];
 289         s += sq[pix1[6] - pix2[6]];
 290         s += sq[pix1[7] - pix2[7]];
 291         pix1 += line_size;
 292         pix2 += line_size;
 293     }
 294     return s;
 295 }
 296
 297 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 298 {
 299     int s, i;
 300     uint32_t *sq = ff_squareTbl + 256;
 301
 302     s = 0;
 303     for (i = 0; i < h; i++) {
 304         s += sq[pix1[ 0] - pix2[ 0]];
 305         s += sq[pix1[ 1] - pix2[ 1]];
 306         s += sq[pix1[ 2] - pix2[ 2]];
 307         s += sq[pix1[ 3] - pix2[ 3]];
 308         s += sq[pix1[ 4] - pix2[ 4]];
 309         s += sq[pix1[ 5] - pix2[ 5]];
 310         s += sq[pix1[ 6] - pix2[ 6]];
 311         s += sq[pix1[ 7] - pix2[ 7]];
 312         s += sq[pix1[ 8] - pix2[ 8]];
 313         s += sq[pix1[ 9] - pix2[ 9]];
 314         s += sq[pix1[10] - pix2[10]];
 315         s += sq[pix1[11] - pix2[11]];
 316         s += sq[pix1[12] - pix2[12]];
 317         s += sq[pix1[13] - pix2[13]];
 318         s += sq[pix1[14] - pix2[14]];
 319         s += sq[pix1[15] - pix2[15]];
 320
 321         pix1 += line_size;
 322         pix2 += line_size;
 323     }
 324     return s;
 325 }
 326
 327 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 328                           const uint8_t *s2, int stride){
 329     int i;
 330
 331     /* read the pixels */
 332     for(i=0;i<8;i++) {
 333         block[0] = s1[0] - s2[0];
 334         block[1] = s1[1] - s2[1];
 335         block[2] = s1[2] - s2[2];
 336         block[3] = s1[3] - s2[3];
 337         block[4] = s1[4] - s2[4];
 338         block[5] = s1[5] - s2[5];
 339         block[6] = s1[6] - s2[6];
 340         block[7] = s1[7] - s2[7];
 341         s1 += stride;
 342         s2 += stride;
 343         block += 8;
 344     }
 345 }
 346
 347
 348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 349                                  int line_size)
 350 {
 351     int i;
 352
 353     /* read the pixels */
 354     for(i=0;i<8;i++) {
 355         pixels[0] = av_clip_uint8(block[0]);
 356         pixels[1] = av_clip_uint8(block[1]);
 357         pixels[2] = av_clip_uint8(block[2]);
 358         pixels[3] = av_clip_uint8(block[3]);
 359         pixels[4] = av_clip_uint8(block[4]);
 360         pixels[5] = av_clip_uint8(block[5]);
 361         pixels[6] = av_clip_uint8(block[6]);
 362         pixels[7] = av_clip_uint8(block[7]);
 363
 364         pixels += line_size;
 365         block += 8;
 366     }
 367 }
 368
 369 static void put_signed_pixels_clamped_c(const int16_t *block,
 370                                         uint8_t *restrict pixels,
 371                                         int line_size)
 372 {
 373     int i, j;
 374
 375     for (i = 0; i < 8; i++) {
 376         for (j = 0; j < 8; j++) {
 377             if (*block < -128)
 378                 *pixels = 0;
 379             else if (*block > 127)
 380                 *pixels = 255;
 381             else
 382                 *pixels = (uint8_t)(*block + 128);
 383             block++;
 384             pixels++;
 385         }
 386         pixels += (line_size - 8);
 387     }
 388 }
 389
 390 static void add_pixels8_c(uint8_t *restrict pixels,
 391                           int16_t *block,
 392                           int line_size)
 393 {
 394     int i;
 395
 396     for(i=0;i<8;i++) {
 397         pixels[0] += block[0];
 398         pixels[1] += block[1];
 399         pixels[2] += block[2];
 400         pixels[3] += block[3];
 401         pixels[4] += block[4];
 402         pixels[5] += block[5];
 403         pixels[6] += block[6];
 404         pixels[7] += block[7];
 405         pixels += line_size;
 406         block += 8;
 407     }
 408 }
 409
 410 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 411                                  int line_size)
 412 {
 413     int i;
 414
 415     /* read the pixels */
 416     for(i=0;i<8;i++) {
 417         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 418         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 419         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 420         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 421         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 422         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 423         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 424         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 425         pixels += line_size;
 426         block += 8;
 427     }
 428 }
 429
 430 static int sum_abs_dctelem_c(int16_t *block)
 431 {
 432     int sum=0, i;
 433     for(i=0; i<64; i++)
 434         sum+= FFABS(block[i]);
 435     return sum;
 436 }
 437
 438 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 439 {
 440     int i;
 441
 442     for (i = 0; i < h; i++) {
 443         memset(block, value, 16);
 444         block += line_size;
 445     }
 446 }
 447
 448 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 449 {
 450     int i;
 451
 452     for (i = 0; i < h; i++) {
 453         memset(block, value, 8);
 454         block += line_size;
 455     }
 456 }
 457
 458 #define avg2(a,b) ((a+b+1)>>1)
 459 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 460
 461 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 462 {
 463     const int A=(16-x16)*(16-y16);
 464     const int B=(   x16)*(16-y16);
 465     const int C=(16-x16)*(   y16);
 466     const int D=(   x16)*(   y16);
 467     int i;
 468
 469     for(i=0; i<h; i++)
 470     {
 471         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 472         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 473         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 474         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 475         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 476         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 477         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 478         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 479         dst+= stride;
 480         src+= stride;
 481     }
 482 }
 483
 484 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 485                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 486 {
 487     int y, vx, vy;
 488     const int s= 1<<shift;
 489
 490     width--;
 491     height--;
 492
 493     for(y=0; y<h; y++){
 494         int x;
 495
 496         vx= ox;
 497         vy= oy;
 498         for(x=0; x<8; x++){ //XXX FIXME optimize
 499             int src_x, src_y, frac_x, frac_y, index;
 500
 501             src_x= vx>>16;
 502             src_y= vy>>16;
 503             frac_x= src_x&(s-1);
 504             frac_y= src_y&(s-1);
 505             src_x>>=shift;
 506             src_y>>=shift;
 507
 508             if((unsigned)src_x < width){
 509                 if((unsigned)src_y < height){
 510                     index= src_x + src_y*stride;
 511                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 512                                            + src[index       +1]*   frac_x )*(s-frac_y)
 513                                         + (  src[index+stride  ]*(s-frac_x)
 514                                            + src[index+stride+1]*   frac_x )*   frac_y
 515                                         + r)>>(shift*2);
 516                 }else{
 517                     index= src_x + av_clip(src_y, 0, height)*stride;
 518                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 519                                           + src[index       +1]*   frac_x )*s
 520                                         + r)>>(shift*2);
 521                 }
 522             }else{
 523                 if((unsigned)src_y < height){
 524                     index= av_clip(src_x, 0, width) + src_y*stride;
 525                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 526                                            + src[index+stride  ]*   frac_y )*s
 527                                         + r)>>(shift*2);
 528                 }else{
 529                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 530                     dst[y*stride + x]=    src[index         ];
 531                 }
 532             }
 533
 534             vx+= dxx;
 535             vy+= dyx;
 536         }
 537         ox += dxy;
 538         oy += dyy;
 539     }
 540 }
 541
 542 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 543     switch(width){
 544     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 545     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 546     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 547     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 548     }
 549 }
 550
 551 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 552     int i,j;
 553     for (i=0; i < height; i++) {
 554       for (j=0; j < width; j++) {
 555         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 556       }
 557       src += stride;
 558       dst += stride;
 559     }
 560 }
 561
 562 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 563     int i,j;
 564     for (i=0; i < height; i++) {
 565       for (j=0; j < width; j++) {
 566         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 567       }
 568       src += stride;
 569       dst += stride;
 570     }
 571 }
 572
 573 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 574     int i,j;
 575     for (i=0; i < height; i++) {
 576       for (j=0; j < width; j++) {
 577         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 578       }
 579       src += stride;
 580       dst += stride;
 581     }
 582 }
 583
 584 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 585     int i,j;
 586     for (i=0; i < height; i++) {
 587       for (j=0; j < width; j++) {
 588         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 589       }
 590       src += stride;
 591       dst += stride;
 592     }
 593 }
 594
 595 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 596     int i,j;
 597     for (i=0; i < height; i++) {
 598       for (j=0; j < width; j++) {
 599         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 600       }
 601       src += stride;
 602       dst += stride;
 603     }
 604 }
 605
 606 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 607     int i,j;
 608     for (i=0; i < height; i++) {
 609       for (j=0; j < width; j++) {
 610         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 611       }
 612       src += stride;
 613       dst += stride;
 614     }
 615 }
 616
 617 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 618     int i,j;
 619     for (i=0; i < height; i++) {
 620       for (j=0; j < width; j++) {
 621         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 622       }
 623       src += stride;
 624       dst += stride;
 625     }
 626 }
 627
 628 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 629     int i,j;
 630     for (i=0; i < height; i++) {
 631       for (j=0; j < width; j++) {
 632         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 633       }
 634       src += stride;
 635       dst += stride;
 636     }
 637 }
 638
 639 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 640     switch(width){
 641     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 642     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 643     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 644     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 645     }
 646 }
 647
 648 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 649     int i,j;
 650     for (i=0; i < height; i++) {
 651       for (j=0; j < width; j++) {
 652         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 653       }
 654       src += stride;
 655       dst += stride;
 656     }
 657 }
 658
 659 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 660     int i,j;
 661     for (i=0; i < height; i++) {
 662       for (j=0; j < width; j++) {
 663         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 664       }
 665       src += stride;
 666       dst += stride;
 667     }
 668 }
 669
 670 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 671     int i,j;
 672     for (i=0; i < height; i++) {
 673       for (j=0; j < width; j++) {
 674         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 675       }
 676       src += stride;
 677       dst += stride;
 678     }
 679 }
 680
 681 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 682     int i,j;
 683     for (i=0; i < height; i++) {
 684       for (j=0; j < width; j++) {
 685         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 686       }
 687       src += stride;
 688       dst += stride;
 689     }
 690 }
 691
 692 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 693     int i,j;
 694     for (i=0; i < height; i++) {
 695       for (j=0; j < width; j++) {
 696         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 697       }
 698       src += stride;
 699       dst += stride;
 700     }
 701 }
 702
 703 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 704     int i,j;
 705     for (i=0; i < height; i++) {
 706       for (j=0; j < width; j++) {
 707         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 708       }
 709       src += stride;
 710       dst += stride;
 711     }
 712 }
 713
 714 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 715     int i,j;
 716     for (i=0; i < height; i++) {
 717       for (j=0; j < width; j++) {
 718         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 719       }
 720       src += stride;
 721       dst += stride;
 722     }
 723 }
 724
 725 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 726     int i,j;
 727     for (i=0; i < height; i++) {
 728       for (j=0; j < width; j++) {
 729         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 730       }
 731       src += stride;
 732       dst += stride;
 733     }
 734 }
 735
 736 #define QPEL_MC(r, OPNAME, RND, OP) \
 737 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 738     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 739     int i;\
 740     for(i=0; i<h; i++)\
 741     {\
 742         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 743         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 744         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 745         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 746         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 747         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 748         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 749         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 750         dst+=dstStride;\
 751         src+=srcStride;\
 752     }\
 753 }\
 754 \
 755 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 756     const int w=8;\
 757     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 758     int i;\
 759     for(i=0; i<w; i++)\
 760     {\
 761         const int src0= src[0*srcStride];\
 762         const int src1= src[1*srcStride];\
 763         const int src2= src[2*srcStride];\
 764         const int src3= src[3*srcStride];\
 765         const int src4= src[4*srcStride];\
 766         const int src5= src[5*srcStride];\
 767         const int src6= src[6*srcStride];\
 768         const int src7= src[7*srcStride];\
 769         const int src8= src[8*srcStride];\
 770         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 771         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 772         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 773         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 774         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 775         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 776         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 777         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 778         dst++;\
 779         src++;\
 780     }\
 781 }\
 782 \
 783 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 784     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 785     int i;\
 786     \
 787     for(i=0; i<h; i++)\
 788     {\
 789         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 790         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 791         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 792         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 793         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 794         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 795         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 796         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 797         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 798         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 799         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 800         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 801         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 802         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 803         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 804         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 805         dst+=dstStride;\
 806         src+=srcStride;\
 807     }\
 808 }\
 809 \
 810 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 811     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 812     int i;\
 813     const int w=16;\
 814     for(i=0; i<w; i++)\
 815     {\
 816         const int src0= src[0*srcStride];\
 817         const int src1= src[1*srcStride];\
 818         const int src2= src[2*srcStride];\
 819         const int src3= src[3*srcStride];\
 820         const int src4= src[4*srcStride];\
 821         const int src5= src[5*srcStride];\
 822         const int src6= src[6*srcStride];\
 823         const int src7= src[7*srcStride];\
 824         const int src8= src[8*srcStride];\
 825         const int src9= src[9*srcStride];\
 826         const int src10= src[10*srcStride];\
 827         const int src11= src[11*srcStride];\
 828         const int src12= src[12*srcStride];\
 829         const int src13= src[13*srcStride];\
 830         const int src14= src[14*srcStride];\
 831         const int src15= src[15*srcStride];\
 832         const int src16= src[16*srcStride];\
 833         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 834         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 835         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 836         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 837         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 838         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 839         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 840         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 841         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 842         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 843         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 844         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 845         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 846         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 847         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 848         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 849         dst++;\
 850         src++;\
 851     }\
 852 }\
 853 \
 854 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 855 {\
 856     uint8_t half[64];\
 857     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 858     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 859 }\
 860 \
 861 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 862 {\
 863     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 864 }\
 865 \
 866 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 867 {\
 868     uint8_t half[64];\
 869     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 870     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 871 }\
 872 \
 873 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 874 {\
 875     uint8_t full[16*9];\
 876     uint8_t half[64];\
 877     copy_block9(full, src, 16, stride, 9);\
 878     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 879     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 880 }\
 881 \
 882 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 883 {\
 884     uint8_t full[16*9];\
 885     copy_block9(full, src, 16, stride, 9);\
 886     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 887 }\
 888 \
 889 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 890 {\
 891     uint8_t full[16*9];\
 892     uint8_t half[64];\
 893     copy_block9(full, src, 16, stride, 9);\
 894     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 895     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 896 }\
 897 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 898 {\
 899     uint8_t full[16*9];\
 900     uint8_t halfH[72];\
 901     uint8_t halfV[64];\
 902     uint8_t halfHV[64];\
 903     copy_block9(full, src, 16, stride, 9);\
 904     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 905     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 906     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 907     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 908 }\
 909 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 910 {\
 911     uint8_t full[16*9];\
 912     uint8_t halfH[72];\
 913     uint8_t halfHV[64];\
 914     copy_block9(full, src, 16, stride, 9);\
 915     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 916     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 917     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 918     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 919 }\
 920 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 921 {\
 922     uint8_t full[16*9];\
 923     uint8_t halfH[72];\
 924     uint8_t halfV[64];\
 925     uint8_t halfHV[64];\
 926     copy_block9(full, src, 16, stride, 9);\
 927     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 928     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 929     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 930     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 931 }\
 932 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 933 {\
 934     uint8_t full[16*9];\
 935     uint8_t halfH[72];\
 936     uint8_t halfHV[64];\
 937     copy_block9(full, src, 16, stride, 9);\
 938     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 939     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 940     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 941     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 942 }\
 943 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 944 {\
 945     uint8_t full[16*9];\
 946     uint8_t halfH[72];\
 947     uint8_t halfV[64];\
 948     uint8_t halfHV[64];\
 949     copy_block9(full, src, 16, stride, 9);\
 950     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 951     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 952     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 953     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 954 }\
 955 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 956 {\
 957     uint8_t full[16*9];\
 958     uint8_t halfH[72];\
 959     uint8_t halfHV[64];\
 960     copy_block9(full, src, 16, stride, 9);\
 961     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 962     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 963     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 964     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 965 }\
 966 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 967 {\
 968     uint8_t full[16*9];\
 969     uint8_t halfH[72];\
 970     uint8_t halfV[64];\
 971     uint8_t halfHV[64];\
 972     copy_block9(full, src, 16, stride, 9);\
 973     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 974     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 975     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 976     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 977 }\
 978 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 979 {\
 980     uint8_t full[16*9];\
 981     uint8_t halfH[72];\
 982     uint8_t halfHV[64];\
 983     copy_block9(full, src, 16, stride, 9);\
 984     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 985     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 986     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 987     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 988 }\
 989 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 990 {\
 991     uint8_t halfH[72];\
 992     uint8_t halfHV[64];\
 993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 994     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 995     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 996 }\
 997 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 998 {\
 999     uint8_t halfH[72];\
1000     uint8_t halfHV[64];\
1001     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1002     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1004 }\
1005 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1006 {\
1007     uint8_t full[16*9];\
1008     uint8_t halfH[72];\
1009     uint8_t halfV[64];\
1010     uint8_t halfHV[64];\
1011     copy_block9(full, src, 16, stride, 9);\
1012     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1016 }\
1017 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1018 {\
1019     uint8_t full[16*9];\
1020     uint8_t halfH[72];\
1021     copy_block9(full, src, 16, stride, 9);\
1022     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1023     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1024     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1025 }\
1026 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1027 {\
1028     uint8_t full[16*9];\
1029     uint8_t halfH[72];\
1030     uint8_t halfV[64];\
1031     uint8_t halfHV[64];\
1032     copy_block9(full, src, 16, stride, 9);\
1033     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1034     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1035     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1036     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1037 }\
1038 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1039 {\
1040     uint8_t full[16*9];\
1041     uint8_t halfH[72];\
1042     copy_block9(full, src, 16, stride, 9);\
1043     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1044     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1045     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1046 }\
1047 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1048 {\
1049     uint8_t halfH[72];\
1050     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1051     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1052 }\
1053 \
1054 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1055 {\
1056     uint8_t half[256];\
1057     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1058     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1059 }\
1060 \
1061 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1062 {\
1063     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1064 }\
1065 \
1066 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1067 {\
1068     uint8_t half[256];\
1069     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1070     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1071 }\
1072 \
1073 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1074 {\
1075     uint8_t full[24*17];\
1076     uint8_t half[256];\
1077     copy_block17(full, src, 24, stride, 17);\
1078     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1079     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1080 }\
1081 \
1082 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1083 {\
1084     uint8_t full[24*17];\
1085     copy_block17(full, src, 24, stride, 17);\
1086     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1087 }\
1088 \
1089 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1090 {\
1091     uint8_t full[24*17];\
1092     uint8_t half[256];\
1093     copy_block17(full, src, 24, stride, 17);\
1094     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1095     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1096 }\
1097 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1098 {\
1099     uint8_t full[24*17];\
1100     uint8_t halfH[272];\
1101     uint8_t halfV[256];\
1102     uint8_t halfHV[256];\
1103     copy_block17(full, src, 24, stride, 17);\
1104     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1105     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1106     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1107     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1108 }\
1109 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1110 {\
1111     uint8_t full[24*17];\
1112     uint8_t halfH[272];\
1113     uint8_t halfHV[256];\
1114     copy_block17(full, src, 24, stride, 17);\
1115     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1116     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1117     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1118     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1119 }\
1120 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1121 {\
1122     uint8_t full[24*17];\
1123     uint8_t halfH[272];\
1124     uint8_t halfV[256];\
1125     uint8_t halfHV[256];\
1126     copy_block17(full, src, 24, stride, 17);\
1127     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1128     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1129     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1130     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1131 }\
1132 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1133 {\
1134     uint8_t full[24*17];\
1135     uint8_t halfH[272];\
1136     uint8_t halfHV[256];\
1137     copy_block17(full, src, 24, stride, 17);\
1138     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1139     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1140     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1141     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1142 }\
1143 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1144 {\
1145     uint8_t full[24*17];\
1146     uint8_t halfH[272];\
1147     uint8_t halfV[256];\
1148     uint8_t halfHV[256];\
1149     copy_block17(full, src, 24, stride, 17);\
1150     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1152     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1153     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1154 }\
1155 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1156 {\
1157     uint8_t full[24*17];\
1158     uint8_t halfH[272];\
1159     uint8_t halfHV[256];\
1160     copy_block17(full, src, 24, stride, 17);\
1161     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1162     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1163     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1164     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1165 }\
1166 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1167 {\
1168     uint8_t full[24*17];\
1169     uint8_t halfH[272];\
1170     uint8_t halfV[256];\
1171     uint8_t halfHV[256];\
1172     copy_block17(full, src, 24, stride, 17);\
1173     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1174     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1175     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1177 }\
1178 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1179 {\
1180     uint8_t full[24*17];\
1181     uint8_t halfH[272];\
1182     uint8_t halfHV[256];\
1183     copy_block17(full, src, 24, stride, 17);\
1184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1188 }\
1189 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1190 {\
1191     uint8_t halfH[272];\
1192     uint8_t halfHV[256];\
1193     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1194     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1196 }\
1197 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1198 {\
1199     uint8_t halfH[272];\
1200     uint8_t halfHV[256];\
1201     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1202     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1204 }\
1205 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1206 {\
1207     uint8_t full[24*17];\
1208     uint8_t halfH[272];\
1209     uint8_t halfV[256];\
1210     uint8_t halfHV[256];\
1211     copy_block17(full, src, 24, stride, 17);\
1212     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1216 }\
1217 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1218 {\
1219     uint8_t full[24*17];\
1220     uint8_t halfH[272];\
1221     copy_block17(full, src, 24, stride, 17);\
1222     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1223     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1224     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1225 }\
1226 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1227 {\
1228     uint8_t full[24*17];\
1229     uint8_t halfH[272];\
1230     uint8_t halfV[256];\
1231     uint8_t halfHV[256];\
1232     copy_block17(full, src, 24, stride, 17);\
1233     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1234     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1235     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1236     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1237 }\
1238 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1239 {\
1240     uint8_t full[24*17];\
1241     uint8_t halfH[272];\
1242     copy_block17(full, src, 24, stride, 17);\
1243     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1244     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1245     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1246 }\
1247 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1248 {\
1249     uint8_t halfH[272];\
1250     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1252 }
1253
1254 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1255 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1256 #define op_put(a, b) a = cm[((b) + 16)>>5]
1257 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1258
1259 QPEL_MC(0, put_       , _       , op_put)
1260 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1261 QPEL_MC(0, avg_       , _       , op_avg)
1262 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1263 #undef op_avg
1264 #undef op_avg_no_rnd
1265 #undef op_put
1266 #undef op_put_no_rnd
1267
/**
 * Fixed-size entry point: forwards to put_pixels8_8_c() with a height
 * of 8 rows, using the same stride for source and destination.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 8 };

    put_pixels8_8_c(dst, src, stride, BLOCK_ROWS);
}
/**
 * Fixed-size entry point: forwards to avg_pixels8_8_c() with a height
 * of 8 rows, using the same stride for source and destination.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 8 };

    avg_pixels8_8_c(dst, src, stride, BLOCK_ROWS);
}
/**
 * Fixed-size entry point: forwards to put_pixels16_8_c() with a height
 * of 16 rows, using the same stride for source and destination.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 16 };

    put_pixels16_8_c(dst, src, stride, BLOCK_ROWS);
}
/**
 * Fixed-size entry point: forwards to avg_pixels16_8_c() with a height
 * of 16 rows, using the same stride for source and destination.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 16 };

    avg_pixels16_8_c(dst, src, stride, BLOCK_ROWS);
}
1284
/*
 * The (0,0) qpel position needs no filtering, so the mc00 names are plain
 * aliases of the fixed-size pixel wrappers.  The put_no_rnd variants map
 * to the same put wrappers as the rounding ones.
 */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1291
1292 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1293     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1294     int i;
1295
1296     for(i=0; i<h; i++){
1297         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1298         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1299         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1300         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1301         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1302         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1303         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1304         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1305         dst+=dstStride;
1306         src+=srcStride;
1307     }
1308 }
1309
1310 #if CONFIG_RV40_DECODER
/**
 * RV40 16x16 "put" at qpel position (3,3): forwards to
 * put_pixels16_xy2_8_c() with a height of 16 rows.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 16 };

    put_pixels16_xy2_8_c(dst, src, stride, BLOCK_ROWS);
}
/**
 * RV40 16x16 "avg" at qpel position (3,3): forwards to
 * avg_pixels16_xy2_8_c() with a height of 16 rows.
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 16 };

    avg_pixels16_xy2_8_c(dst, src, stride, BLOCK_ROWS);
}
/**
 * RV40 8x8 "put" at qpel position (3,3): forwards to
 * put_pixels8_xy2_8_c() with a height of 8 rows.
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 8 };

    put_pixels8_xy2_8_c(dst, src, stride, BLOCK_ROWS);
}
/**
 * RV40 8x8 "avg" at qpel position (3,3): forwards to
 * avg_pixels8_xy2_8_c() with a height of 8 rows.
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 8 };

    avg_pixels8_xy2_8_c(dst, src, stride, BLOCK_ROWS);
}
1327 #endif /* CONFIG_RV40_DECODER */
1328
1329 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1330     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1331     int i;
1332
1333     for(i=0; i<w; i++){
1334         const int src_1= src[ -srcStride];
1335         const int src0 = src[0          ];
1336         const int src1 = src[  srcStride];
1337         const int src2 = src[2*srcStride];
1338         const int src3 = src[3*srcStride];
1339         const int src4 = src[4*srcStride];
1340         const int src5 = src[5*srcStride];
1341         const int src6 = src[6*srcStride];
1342         const int src7 = src[7*srcStride];
1343         const int src8 = src[8*srcStride];
1344         const int src9 = src[9*srcStride];
1345         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1346         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1347         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1348         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1349         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1350         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1351         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1352         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1353         src++;
1354         dst++;
1355     }
1356 }
1357
/**
 * WMV2 mspel 8x8, position (1,0): horizontally lowpass-filter the block
 * into a temporary, then combine it with the unshifted source via
 * put_pixels8_l2_8() (presumably a two-input average -- confirm against
 * its definition).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1364
/**
 * WMV2 mspel 8x8, position (2,0): the horizontal lowpass output is
 * written straight to dst for 8 rows.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_ROWS = 8 };

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, BLOCK_ROWS);
}
1369
/**
 * WMV2 mspel 8x8, position (3,0): horizontally lowpass-filter the block
 * into a temporary, then combine it with the source shifted one pixel to
 * the right via put_pixels8_l2_8() (presumably a two-input average --
 * confirm against its definition).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1376
/**
 * WMV2 mspel 8x8, position (0,2): the vertical lowpass output is written
 * straight to dst for 8 columns.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_COLS = 8 };

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, BLOCK_COLS);
}
1381
/**
 * WMV2 mspel 8x8, position (1,2).
 *
 * Builds two intermediates and combines them with put_pixels8_l2_8():
 *  - the vertical lowpass of the source, and
 *  - the vertical lowpass of the horizontally filtered source.
 *
 * The horizontal pass starts one row above src and covers 11 rows because
 * the vertical filter reads rows -1..9 around its 8 output rows; the
 * vertical pass is therefore started at the second filtered row.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    /* The two first-stage passes write to distinct temporaries. */
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);

    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * WMV2 mspel 8x8, position (3,2).
 *
 * Same structure as the (1,2) case, except that the purely vertical
 * intermediate is taken from the source shifted one pixel to the right.
 * The horizontal pass covers 11 rows starting one row above src so that
 * the following vertical pass has the rows -1..9 it reads.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    /* The two first-stage passes write to distinct temporaries. */
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);

    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * WMV2 mspel 8x8, position (2,2): horizontal lowpass into an 11-row
 * temporary (starting one row above src), followed by a vertical lowpass
 * of that temporary written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    /* Skip the first filtered row: it is the "row -1" tap of the v filter. */
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1408
/**
 * Sum of absolute differences over a 16-pixel-wide block.
 *
 * @param v          unused
 * @param pix1       first block
 * @param pix2       second block
 * @param line_size  distance between consecutive rows, for both blocks
 * @param h          number of rows to compare
 * @return sum of |pix1 - pix2| over the 16 x h area
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1436
/**
 * Sum of absolute differences between a 16-wide block and the second
 * block interpolated horizontally: each reference sample is
 * avg2(pix2[x], pix2[x+1]), so 17 samples are read per pix2 row.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, for both blocks
 * @param h          number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1464
/**
 * Sum of absolute differences between a 16-wide block and the second
 * block interpolated vertically: each reference sample is avg2() of a
 * pix2 pixel and the pixel one row below it, so h+1 rows of pix2 are read.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, for both blocks
 * @param h          number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1494
/**
 * Sum of absolute differences between a 16-wide block and the second
 * block interpolated in both directions: each reference sample is avg4()
 * of the 2x2 neighbourhood starting at the current pix2 position, so 17
 * samples per row and h+1 rows of pix2 are read.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, for both blocks
 * @param h          number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1524
/**
 * Sum of absolute differences over an 8-pixel-wide block.
 *
 * @param v          unused
 * @param pix1       first block
 * @param pix2       second block
 * @param line_size  distance between consecutive rows, for both blocks
 * @param h          number of rows to compare
 * @return sum of |pix1 - pix2| over the 8 x h area
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1544
/**
 * Sum of absolute differences between an 8-wide block and the second
 * block interpolated horizontally: each reference sample is
 * avg2(pix2[x], pix2[x+1]), so 9 samples are read per pix2 row.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, for both blocks
 * @param h          number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1564
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are formed with avg2() from each pixel and the pixel one
 * row below it.
 *
 * Reads h + 1 rows of the reference.
 *
 * @param v         unused context pointer
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1586
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are formed with avg4() from each 2x2 neighbourhood
 * (pixel, right neighbour, and the two pixels below them).
 *
 * Reads h + 1 rows of nine pixels from the reference.
 *
 * @param v         unused context pointer
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1608
1609 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1610     MpegEncContext *c = v;
1611     int score1=0;
1612     int score2=0;
1613     int x,y;
1614
1615     for(y=0; y<h; y++){
1616         for(x=0; x<16; x++){
1617             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1618         }
1619         if(y+1<h){
1620             for(x=0; x<15; x++){
1621                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1622                              - s1[x+1] + s1[x+1+stride])
1623                         -FFABS(  s2[x  ] - s2[x  +stride]
1624                              - s2[x+1] + s2[x+1+stride]);
1625             }
1626         }
1627         s1+= stride;
1628         s2+= stride;
1629     }
1630
1631     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1632     else  return score1 + FFABS(score2)*8;
1633 }
1634
1635 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1636     MpegEncContext *c = v;
1637     int score1=0;
1638     int score2=0;
1639     int x,y;
1640
1641     for(y=0; y<h; y++){
1642         for(x=0; x<8; x++){
1643             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1644         }
1645         if(y+1<h){
1646             for(x=0; x<7; x++){
1647                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1648                              - s1[x+1] + s1[x+1+stride])
1649                         -FFABS(  s2[x  ] - s2[x  +stride]
1650                              - s2[x+1] + s2[x+1+stride]);
1651             }
1652         }
1653         s1+= stride;
1654         s2+= stride;
1655     }
1656
1657     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1658     else  return score1 + FFABS(score2)*8;
1659 }
1660
1661 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1662     int i;
1663     unsigned int sum=0;
1664
1665     for(i=0; i<8*8; i++){
1666         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1667         int w= weight[i];
1668         b>>= RECON_SHIFT;
1669         assert(-512<b && b<512);
1670
1671         sum += (w*b)*(w*b)>>4;
1672     }
1673     return sum>>2;
1674 }
1675
1676 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1677     int i;
1678
1679     for(i=0; i<8*8; i++){
1680         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1681     }
1682 }
1683
/**
 * Comparison function that ignores its inputs and always reports a score
 * of 0. Installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
1687
1688 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1689     int i;
1690
1691     memset(cmp, 0, sizeof(void*)*6);
1692
1693     for(i=0; i<6; i++){
1694         switch(type&0xFF){
1695         case FF_CMP_SAD:
1696             cmp[i]= c->sad[i];
1697             break;
1698         case FF_CMP_SATD:
1699             cmp[i]= c->hadamard8_diff[i];
1700             break;
1701         case FF_CMP_SSE:
1702             cmp[i]= c->sse[i];
1703             break;
1704         case FF_CMP_DCT:
1705             cmp[i]= c->dct_sad[i];
1706             break;
1707         case FF_CMP_DCT264:
1708             cmp[i]= c->dct264_sad[i];
1709             break;
1710         case FF_CMP_DCTMAX:
1711             cmp[i]= c->dct_max[i];
1712             break;
1713         case FF_CMP_PSNR:
1714             cmp[i]= c->quant_psnr[i];
1715             break;
1716         case FF_CMP_BIT:
1717             cmp[i]= c->bit[i];
1718             break;
1719         case FF_CMP_RD:
1720             cmp[i]= c->rd[i];
1721             break;
1722         case FF_CMP_VSAD:
1723             cmp[i]= c->vsad[i];
1724             break;
1725         case FF_CMP_VSSE:
1726             cmp[i]= c->vsse[i];
1727             break;
1728         case FF_CMP_ZERO:
1729             cmp[i]= zero_cmp;
1730             break;
1731         case FF_CMP_NSSE:
1732             cmp[i]= c->nsse[i];
1733             break;
1734         default:
1735             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1736         }
1737     }
1738 }
1739
1740 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1741     long i;
1742     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1743         long a = *(long*)(src+i);
1744         long b = *(long*)(dst+i);
1745         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1746     }
1747     for(; i<w; i++)
1748         dst[i+0] += src[i+0];
1749 }
1750
/**
 * Store the byte-wise difference src1 - src2 into dst (each byte wraps
 * independently).
 *
 * @param dst  destination buffer, w bytes
 * @param src1 minuend, w bytes
 * @param src2 subtrahend, w bytes
 * @param w    number of bytes
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    /* Without fast unaligned access, fall back to an 8x unrolled byte loop
     * whenever src2 is not long-aligned. */
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* Word-at-a-time path: the top bit of every byte of a is forced on and
     * only the low 7 bits of b are subtracted, so no borrow crosses a byte
     * boundary; the XOR then fixes up the top bit of each byte. */
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    /* Remaining bytes not covered by either loop above. */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1775
/**
 * Reconstruct a row from median-predicted residuals.
 *
 * For each position the predictor is mid_pred() of the previous output
 * (left), the sample from src1 at the same position, and
 * (left + src1[i] - left_top) & 0xFF; the residual diff[i] is added to it
 * and the result is truncated to 8 bits.
 *
 * @param dst      reconstructed output, w bytes
 * @param src1     reference row (read at the same index as the output)
 * @param diff     residuals, w bytes
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed sample
 * @param left_top in: initial left-top value; out: last src1 sample read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient = (cur + src1[x] - diag) & 0xFF;

        cur    = mid_pred(cur, src1[x], gradient) + diff[x];
        diag   = src1[x];
        dst[x] = cur;
    }

    *left     = cur;
    *left_top = diag;
}
1792
/**
 * Compute median-prediction residuals for a row.
 *
 * For each position the predictor is mid_pred() of the previous src2
 * sample (left), the src1 sample at the same position, and
 * (left + src1[i] - left_top) & 0xFF; the stored residual is
 * src2[i] - predictor truncated to 8 bits.
 *
 * @param dst      residual output, w bytes
 * @param src1     reference row (read at the same index as the output)
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial left value; out: last src2 sample read
 * @param left_top in: initial left-top value; out: last src1 sample read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int gradient  = (cur + src1[x] - diag) & 0xFF;
        const int predictor = mid_pred(cur, src1[x], gradient);

        diag   = src1[x];
        cur    = src2[x];
        dst[x] = cur - predictor;
    }

    *left     = cur;
    *left_top = diag;
}
1810
/**
 * Running-sum reconstruction: each output byte is the low 8 bits of the
 * accumulator after adding the corresponding source byte.
 *
 * @param dst output, w bytes
 * @param src residuals, w bytes
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1829
/* Byte offsets of the four channels inside a 32-bit pixel; the order is
 * reversed when HAVE_BIGENDIAN is set. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running-sum reconstruction of a row of 4-byte pixels.
 *
 * Each channel keeps its own accumulator; every output byte is the low
 * 8 bits of that channel's accumulator after adding the source byte.
 *
 * @param dst   output, 4 * w bytes
 * @param src   residuals, 4 * w bytes
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: accumulator after the row
 * @param green same, for the G channel
 * @param blue  same, for the B channel
 * @param alpha same, for the A channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    const uint8_t *in = src;
    uint8_t *out = dst;
    int remaining;

    for (remaining = w; remaining > 0; remaining--) {
        /* read the whole pixel before writing it */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;

        in  += 4;
        out += 4;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1870
/* Butterfly with separate outputs: o1 receives i1 + i2, o2 receives
 * i1 - i2. Expands to two statements (each already terminated by ';'),
 * so it must not be used as the unbraced body of an if/else/loop.
 * i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y and y becomes x - y (computed from
 * the original values, which are first copied into int temporaries).
 * The block declares locals named a and b, so arguments must not be
 * expressions that refer to outer variables with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage fused with the absolute-value sum:
 * |x + y| + |x - y|, without storing the results. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1885
/**
 * Sum of absolute transformed differences for one 8x8 block.
 *
 * The pixel difference src - dst is run through three butterfly stages
 * along each row and three along each column (an 8-point Hadamard-style
 * transform in both directions); the absolute values of all 64 results
 * are summed.
 *
 * @param s      unused context pointer
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the sum of absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* Horizontal pass: transform each row of the difference into temp. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Vertical pass: transform each column; the last stage is folded into
     * the absolute-value accumulation via BUTTERFLYA. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
1930
/**
 * Sum of absolute transform coefficients of one 8x8 block of pixels,
 * excluding the mean (DC) term.
 *
 * Same butterfly structure as the difference variant, but applied to the
 * pixels of src directly. After summing, the absolute value of
 * temp[0] + temp[32] is subtracted again (marked "-mean" in the code).
 *
 * @param s      unused context pointer
 * @param src    8x8 block of pixels
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the sum of absolute coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* Horizontal pass: transform each row of pixels into temp. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Vertical pass; the last stage is evaluated by BUTTERFLYA without
     * being written back, so temp[8*0] and temp[8*4] still hold the
     * second-stage values when used below. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
1978
/**
 * Sum of absolute DCT coefficients of the difference between two 8x8
 * blocks, using the diff_pixels, fdct and sum_abs_dctelem callbacks of
 * the context's DSPContext.
 *
 * @param c      MpegEncContext (must not be NULL; it is dereferenced)
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the value returned by sum_abs_dctelem on the transformed difference
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
1989
#if CONFIG_GPL
/* One 8-point integer transform pass built from additions, subtractions
 * and shifts only. The user must define SRC(x) to read input element x and
 * DST(x,v) to consume output element x with value v before expanding it.
 * NOTE(review): presumably the H.264-style 8x8 integer transform, judging
 * by the name of the function below -- confirm. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied in both
 * directions to the difference between two 8x8 blocks.
 *
 * Unlike the neighbouring 8x8 metrics, h is not checked here.
 *
 * @param c      MpegEncContext (dereferenced for dsp.diff_pixels)
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused
 * @return the sum of absolute transform coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform each row in place. */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* Second pass: transform each column; outputs are not stored, only
     * their absolute values are accumulated. */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2042
2043 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2044     MpegEncContext * const s= (MpegEncContext *)c;
2045     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2046     int sum=0, i;
2047
2048     assert(h==8);
2049
2050     s->dsp.diff_pixels(temp, src1, src2, stride);
2051     s->dsp.fdct(temp);
2052
2053     for(i=0; i<64; i++)
2054         sum= FFMAX(sum, FFABS(temp[i]));
2055
2056     return sum;
2057 }
2058
/**
 * Squared error introduced by quantizing the difference between two 8x8
 * blocks.
 *
 * The difference is quantized, dequantized as an inter block and inverse
 * transformed; the result is compared against a saved copy of the
 * original difference.
 *
 * Side effects on the context: mb_intra is set to 0 and
 * block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext (must not be NULL; it is dereferenced)
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the sum of squared differences between the original and the
 *         reconstructed residual
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
    /* second half of the buffer keeps the unquantized residual */
    int16_t * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(int16_t));

    /* i only receives the quantizer's last output argument here; its value
     * is discarded because the loop below reinitializes it */
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2081
/**
 * Rate-distortion score for coding the difference between two 8x8 blocks.
 *
 * The residual is quantized, its coefficient bits are estimated from the
 * context's VLC length tables, the block is reconstructed, and the squared
 * error against the source is combined with the bit count as
 * distortion + ((bits * qscale^2 * 109 + 64) >> 7).
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext (must not be NULL; it is dereferenced)
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the combined rate-distortion score
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* Work on packed local copies (row stride 8) so the inputs are not
     * modified by the reconstruction below. */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i only receives the quantizer's last output argument; it is reused
     * as a loop counter afterwards */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* Intra blocks count the first coefficient separately via the DC
     * table and start the AC scan at index 1. */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* bias by 64: levels in [-64, 63] map to [0, 127] and use
                 * the table, anything else costs the escape length */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses its own length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct on top of the local copy of src2 */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2157
/**
 * Estimated number of bits needed to code the quantized difference
 * between two 8x8 blocks, based on the context's VLC length tables.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext (must not be NULL; it is dereferenced)
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the accumulated bit estimate
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i only receives the quantizer's last output argument; it is reused
     * as a loop counter afterwards */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* Intra blocks count the first coefficient separately via the DC
     * table and start the AC scan at index 1. */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* bias by 64: levels in [-64, 63] map to [0, 127] and use
                 * the table, anything else costs the escape length */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses its own length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2216
/* Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over a block that is <size>
 * pixels wide and h rows high (h - 1 row pairs). The inner loop handles
 * four columns per iteration, so <size> must be a multiple of 4. The
 * context and dummy arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2234
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: for
 * every pair of vertically adjacent rows, accumulates the absolute value
 * of (s1 - s2) on the upper row minus (s1 - s2) on the lower row.
 *
 * @param c      unused context pointer
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows (h - 1 row pairs are evaluated)
 * @return the accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int change = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += FFABS(change);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2249
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel directly below it, over a block that is <size>
 * pixels wide and h rows high (h - 1 row pairs). The context and dummy
 * arguments are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h) \
{ \
    int total = 0; \
    int row, col; \
 \
    for (row = 1; row < h; row++) { \
        for (col = 0; col < size; col++) \
            total += SQ(s[col] - s[col + stride]); \
        s += stride; \
    } \
 \
    return total; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2268
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: for
 * every pair of vertically adjacent rows, accumulates the square of
 * (s1 - s2) on the upper row minus (s1 - s2) on the lower row.
 *
 * @param c      unused context pointer
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows (h - 1 row pairs are evaluated)
 * @return the accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int change = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += change * change;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2283
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k = 0;

    while (k < size) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
        k++;
    }
    return total;
}
2292
/* Builds a 16-pixel-wide comparison function name16 out of an 8x8 one
 * (name8): the score is the sum of name8 over the left and right 8x8
 * halves of the top eight rows and, only when h == 16, over the two 8x8
 * blocks of the next eight rows as well. name8 is always called with a
 * height of 8; any h other than 16 evaluates just the top eight rows. */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

/* 16-wide variants of the 8x8 metrics defined in this file. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2317
/**
 * Clip one value given as a raw 32-bit pattern using unsigned integer
 * comparisons only.
 *
 * @param a        bit pattern to clip
 * @param mini     bit pattern returned when a compares above it
 * @param maxi     bit pattern returned when a, with bit 31 flipped,
 *                 compares above maxisign
 * @param maxisign maxi with bit 31 flipped (precomputed by the caller)
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2326
/**
 * Clip a float vector by operating on the raw 32-bit patterns of the
 * values with clipf_c_one(). Called by vector_clipf_c() when min < 0 and
 * max > 0.
 *
 * Elements are processed in groups of 8, so len is effectively rounded up
 * to the next multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len)
{
    const uint32_t mini     = *(uint32_t*)min;
    const uint32_t maxi     = *(uint32_t*)max;
    const uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *out           = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], mini, maxi, maxisign);
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs (min < 0 and max > 0) the work is
 * delegated to vector_clipf_c_opposite_sign(); otherwise av_clipf() is
 * applied per element. Elements are processed in groups of 8, so len is
 * effectively rounded up to the next multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2362
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return the sum of the element-wise products
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        acc += v1[k] * v2[k];

    return acc;
}
2372
/**
 * Dot product of v1 and v2, combined with an in-place update of v1.
 *
 * For each element the product v1[i] * v2[i] is accumulated using the
 * value of v1[i] before the update, then mul * v3[i] is added to v1[i].
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return the dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2382
/**
 * Clip every element of an int32_t vector to [min, max].
 *
 * The main loop is unrolled eight times. The loop conditions are written
 * so that len == 0 is a no-op and a len that is not a multiple of 8 is
 * finished element by element, instead of letting the unsigned counter
 * wrap around and run past the end of the buffers.
 *
 * @param dst output vector, len elements
 * @param src input vector, len elements
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    for (; len >= 8; len -= 8) {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
    }
    /* leftover elements when len is not a multiple of 8 */
    for (; len > 0; len--)
        *dst++ = av_clip(*src++, min, max);
}
2398
/**
 * Run ff_j_rev_dct() on the block in place, then hand the result to
 * put_pixels_clamped_c() to store it at dest.
 *
 * @param dest      destination pixels
 * @param line_size distance in bytes between consecutive destination rows
 * @param block     coefficients; overwritten by the transform
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct() on the block in place, then hand the result to
 * add_pixels_clamped_c() to combine it with the pixels at dest.
 *
 * @param dest      destination pixels
 * @param line_size distance in bytes between consecutive destination rows
 * @param block     coefficients; overwritten by the transform
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2409
2410 /* init static data */
2411 av_cold void ff_dsputil_static_init(void)
2412 {
2413     int i;
2414
2415     for(i=0;i<512;i++) {
2416         ff_squareTbl[i] = (i - 256) * (i - 256);
2417     }
2418 }
2419
2420 int ff_check_alignment(void){
2421     static int did_fail=0;
2422     LOCAL_ALIGNED_16(int, aligned, [4]);
2423
2424     if((intptr_t)aligned & 15){
2425         if(!did_fail){
2426 #if HAVE_MMX || HAVE_ALTIVEC
2427             av_log(NULL, AV_LOG_ERROR,
2428                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2429                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2430                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2431                 "Do not report crashes to Libav developers.\n");
2432 #endif
2433             did_fail=1;
2434         }
2435         return -1;
2436     }
2437     return 0;
2438 }
2439
2440 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2441 {
2442     ff_check_alignment();
2443
2444 #if CONFIG_ENCODERS
2445     if (avctx->bits_per_raw_sample == 10) {
2446         c->fdct    = ff_jpeg_fdct_islow_10;
2447         c->fdct248 = ff_fdct248_islow_10;
2448     } else {
2449         if(avctx->dct_algo==FF_DCT_FASTINT) {
2450             c->fdct    = ff_fdct_ifast;
2451             c->fdct248 = ff_fdct_ifast248;
2452         }
2453         else if(avctx->dct_algo==FF_DCT_FAAN) {
2454             c->fdct    = ff_faandct;
2455             c->fdct248 = ff_faandct248;
2456         }
2457         else {
2458             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2459             c->fdct248 = ff_fdct248_islow_8;
2460         }
2461     }
2462 #endif //CONFIG_ENCODERS
2463
2464     if (avctx->bits_per_raw_sample == 10) {
2465         c->idct_put              = ff_simple_idct_put_10;
2466         c->idct_add              = ff_simple_idct_add_10;
2467         c->idct                  = ff_simple_idct_10;
2468         c->idct_permutation_type = FF_NO_IDCT_PERM;
2469     } else {
2470         if(avctx->idct_algo==FF_IDCT_INT){
2471             c->idct_put= jref_idct_put;
2472             c->idct_add= jref_idct_add;
2473             c->idct    = ff_j_rev_dct;
2474             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2475         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2476             c->idct_put= ff_faanidct_put;
2477             c->idct_add= ff_faanidct_add;
2478             c->idct    = ff_faanidct;
2479             c->idct_permutation_type= FF_NO_IDCT_PERM;
2480         }else{ //accurate/default
2481             c->idct_put = ff_simple_idct_put_8;
2482             c->idct_add = ff_simple_idct_add_8;
2483             c->idct     = ff_simple_idct_8;
2484             c->idct_permutation_type= FF_NO_IDCT_PERM;
2485         }
2486     }
2487
2488     c->diff_pixels = diff_pixels_c;
2489     c->put_pixels_clamped = put_pixels_clamped_c;
2490     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2491     c->add_pixels_clamped = add_pixels_clamped_c;
2492     c->sum_abs_dctelem = sum_abs_dctelem_c;
2493     c->gmc1 = gmc1_c;
2494     c->gmc = ff_gmc_c;
2495     c->pix_sum = pix_sum_c;
2496     c->pix_norm1 = pix_norm1_c;
2497
2498     c->fill_block_tab[0] = fill_block16_c;
2499     c->fill_block_tab[1] = fill_block8_c;
2500
2501     /* TODO [0] 16  [1] 8 */
2502     c->pix_abs[0][0] = pix_abs16_c;
2503     c->pix_abs[0][1] = pix_abs16_x2_c;
2504     c->pix_abs[0][2] = pix_abs16_y2_c;
2505     c->pix_abs[0][3] = pix_abs16_xy2_c;
2506     c->pix_abs[1][0] = pix_abs8_c;
2507     c->pix_abs[1][1] = pix_abs8_x2_c;
2508     c->pix_abs[1][2] = pix_abs8_y2_c;
2509     c->pix_abs[1][3] = pix_abs8_xy2_c;
2510
2511     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2512     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2513     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2514     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2515     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2516     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2517     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2518     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2519     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2520
2521     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2522     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2523     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2524     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2525     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2526     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2527     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2528     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2529     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2530
2531 #define dspfunc(PFX, IDX, NUM) \
2532     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2533     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2534     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2535     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2536     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2537     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2538     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2539     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2540     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2541     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2542     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2543     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2544     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2545     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2546     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2547     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2548
2549     dspfunc(put_qpel, 0, 16);
2550     dspfunc(put_no_rnd_qpel, 0, 16);
2551
2552     dspfunc(avg_qpel, 0, 16);
2553     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2554
2555     dspfunc(put_qpel, 1, 8);
2556     dspfunc(put_no_rnd_qpel, 1, 8);
2557
2558     dspfunc(avg_qpel, 1, 8);
2559     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2560
2561 #undef dspfunc
2562
2563     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2564     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2565     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2566     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2567     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2568     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2569     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2570     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2571
2572 #define SET_CMP_FUNC(name) \
2573     c->name[0]= name ## 16_c;\
2574     c->name[1]= name ## 8x8_c;
2575
2576     SET_CMP_FUNC(hadamard8_diff)
2577     c->hadamard8_diff[4]= hadamard8_intra16_c;
2578     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2579     SET_CMP_FUNC(dct_sad)
2580     SET_CMP_FUNC(dct_max)
2581 #if CONFIG_GPL
2582     SET_CMP_FUNC(dct264_sad)
2583 #endif
2584     c->sad[0]= pix_abs16_c;
2585     c->sad[1]= pix_abs8_c;
2586     c->sse[0]= sse16_c;
2587     c->sse[1]= sse8_c;
2588     c->sse[2]= sse4_c;
2589     SET_CMP_FUNC(quant_psnr)
2590     SET_CMP_FUNC(rd)
2591     SET_CMP_FUNC(bit)
2592     c->vsad[0]= vsad16_c;
2593     c->vsad[4]= vsad_intra16_c;
2594     c->vsad[5]= vsad_intra8_c;
2595     c->vsse[0]= vsse16_c;
2596     c->vsse[4]= vsse_intra16_c;
2597     c->vsse[5]= vsse_intra8_c;
2598     c->nsse[0]= nsse16_c;
2599     c->nsse[1]= nsse8_c;
2600
2601     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2602
2603     c->add_bytes= add_bytes_c;
2604     c->diff_bytes= diff_bytes_c;
2605     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2606     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2607     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2608     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2609     c->bswap_buf= bswap_buf;
2610     c->bswap16_buf = bswap16_buf;
2611
2612     c->try_8x8basis= try_8x8basis_c;
2613     c->add_8x8basis= add_8x8basis_c;
2614
2615     c->vector_clipf = vector_clipf_c;
2616     c->scalarproduct_int16 = scalarproduct_int16_c;
2617     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2618     c->vector_clip_int32 = vector_clip_int32_c;
2619
2620     c->shrink[0]= av_image_copy_plane;
2621     c->shrink[1]= ff_shrink22;
2622     c->shrink[2]= ff_shrink44;
2623     c->shrink[3]= ff_shrink88;
2624
2625     c->add_pixels8 = add_pixels8_c;
2626
2627 #undef FUNC
2628 #undef FUNCC
2629 #define FUNC(f, depth) f ## _ ## depth
2630 #define FUNCC(f, depth) f ## _ ## depth ## _c
2631
2632     c->draw_edges                    = FUNCC(draw_edges, 8);
2633     c->clear_block                   = FUNCC(clear_block, 8);
2634     c->clear_blocks                  = FUNCC(clear_blocks, 8);
2635
2636 #define BIT_DEPTH_FUNCS(depth) \
2637     c->get_pixels                    = FUNCC(get_pixels,   depth);
2638
2639     switch (avctx->bits_per_raw_sample) {
2640     case 9:
2641     case 10:
2642         BIT_DEPTH_FUNCS(16);
2643         break;
2644     default:
2645         BIT_DEPTH_FUNCS(8);
2646         break;
2647     }
2648
2649
2650     if (ARCH_ARM)
2651         ff_dsputil_init_arm(c, avctx);
2652     if (ARCH_BFIN)
2653         ff_dsputil_init_bfin(c, avctx);
2654     if (ARCH_PPC)
2655         ff_dsputil_init_ppc(c, avctx);
2656     if (ARCH_SH4)
2657         ff_dsputil_init_sh4(c, avctx);
2658     if (HAVE_VIS)
2659         ff_dsputil_init_vis(c, avctx);
2660     if (ARCH_X86)
2661         ff_dsputil_init_x86(c, avctx);
2662
2663     ff_init_scantable_permutation(c->idct_permutation,
2664                                   c->idct_permutation_type);
2665 }