git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/attributes.h"
  31 #include "libavutil/imgutils.h"
  32 #include "avcodec.h"
  33 #include "copy_block.h"
  34 #include "dct.h"
  35 #include "dsputil.h"
  36 #include "simple_idct.h"
  37 #include "faandct.h"
  38 #include "faanidct.h"
  39 #include "imgconvert.h"
  40 #include "mathops.h"
  41 #include "mpegvideo.h"
  42 #include "config.h"
  43
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 16
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 8
  51 #include "dsputil_template.c"
  52
  53 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  54 #define pb_7f (~0UL/255 * 0x7f)
  55 #define pb_80 (~0UL/255 * 0x80)
  56
  57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  58    specification, we interleave the fields */
  59 const uint8_t ff_zigzag248_direct[64] = {
  60      0,  8,  1,  9, 16, 24,  2, 10,
  61     17, 25, 32, 40, 48, 56, 33, 41,
  62     18, 26,  3, 11,  4, 12, 19, 27,
  63     34, 42, 49, 57, 50, 58, 35, 43,
  64     20, 28,  5, 13,  6, 14, 21, 29,
  65     36, 44, 51, 59, 52, 60, 37, 45,
  66     22, 30,  7, 15, 23, 31, 38, 46,
  67     53, 61, 54, 62, 39, 47, 55, 63,
  68 };
  69
  70 const uint8_t ff_alternate_horizontal_scan[64] = {
  71     0,  1,   2,  3,  8,  9, 16, 17,
  72     10, 11,  4,  5,  6,  7, 15, 14,
  73     13, 12, 19, 18, 24, 25, 32, 33,
  74     26, 27, 20, 21, 22, 23, 28, 29,
  75     30, 31, 34, 35, 40, 41, 48, 49,
  76     42, 43, 36, 37, 38, 39, 44, 45,
  77     46, 47, 50, 51, 56, 57, 58, 59,
  78     52, 53, 54, 55, 60, 61, 62, 63,
  79 };
  80
  81 const uint8_t ff_alternate_vertical_scan[64] = {
  82     0,  8,  16, 24,  1,  9,  2, 10,
  83     17, 25, 32, 40, 48, 56, 57, 49,
  84     41, 33, 26, 18,  3, 11,  4, 12,
  85     19, 27, 34, 42, 50, 58, 35, 43,
  86     51, 59, 20, 28,  5, 13,  6, 14,
  87     21, 29, 36, 44, 52, 60, 37, 45,
  88     53, 61, 22, 30,  7, 15, 23, 31,
  89     38, 46, 54, 62, 39, 47, 55, 63,
  90 };
  91
  92 /* Input permutation for the simple_idct_mmx */
  93 static const uint8_t simple_mmx_permutation[64]={
  94         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  95         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
  96         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
  97         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
  98         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
  99         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 100         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 101         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 102 };
 103
 104 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 105
 106 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
 107                                const uint8_t *src_scantable)
 108 {
 109     int i;
 110     int end;
 111
 112     st->scantable= src_scantable;
 113
 114     for(i=0; i<64; i++){
 115         int j;
 116         j = src_scantable[i];
 117         st->permutated[i] = permutation[j];
 118     }
 119
 120     end=-1;
 121     for(i=0; i<64; i++){
 122         int j;
 123         j = st->permutated[i];
 124         if(j>end) end=j;
 125         st->raster_end[i]= end;
 126     }
 127 }
 128
 129 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
 130                                            int idct_permutation_type)
 131 {
 132     int i;
 133
 134     switch(idct_permutation_type){
 135     case FF_NO_IDCT_PERM:
 136         for(i=0; i<64; i++)
 137             idct_permutation[i]= i;
 138         break;
 139     case FF_LIBMPEG2_IDCT_PERM:
 140         for(i=0; i<64; i++)
 141             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 142         break;
 143     case FF_SIMPLE_IDCT_PERM:
 144         for(i=0; i<64; i++)
 145             idct_permutation[i]= simple_mmx_permutation[i];
 146         break;
 147     case FF_TRANSPOSE_IDCT_PERM:
 148         for(i=0; i<64; i++)
 149             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 150         break;
 151     case FF_PARTTRANS_IDCT_PERM:
 152         for(i=0; i<64; i++)
 153             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 154         break;
 155     case FF_SSE2_IDCT_PERM:
 156         for(i=0; i<64; i++)
 157             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 158         break;
 159     default:
 160         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 161     }
 162 }
 163
 164 static int pix_sum_c(uint8_t * pix, int line_size)
 165 {
 166     int s, i, j;
 167
 168     s = 0;
 169     for (i = 0; i < 16; i++) {
 170         for (j = 0; j < 16; j += 8) {
 171             s += pix[0];
 172             s += pix[1];
 173             s += pix[2];
 174             s += pix[3];
 175             s += pix[4];
 176             s += pix[5];
 177             s += pix[6];
 178             s += pix[7];
 179             pix += 8;
 180         }
 181         pix += line_size - 16;
 182     }
 183     return s;
 184 }
 185
 186 static int pix_norm1_c(uint8_t * pix, int line_size)
 187 {
 188     int s, i, j;
 189     uint32_t *sq = ff_squareTbl + 256;
 190
 191     s = 0;
 192     for (i = 0; i < 16; i++) {
 193         for (j = 0; j < 16; j += 8) {
 194 #if 0
 195             s += sq[pix[0]];
 196             s += sq[pix[1]];
 197             s += sq[pix[2]];
 198             s += sq[pix[3]];
 199             s += sq[pix[4]];
 200             s += sq[pix[5]];
 201             s += sq[pix[6]];
 202             s += sq[pix[7]];
 203 #else
 204 #if HAVE_FAST_64BIT
 205             register uint64_t x=*(uint64_t*)pix;
 206             s += sq[x&0xff];
 207             s += sq[(x>>8)&0xff];
 208             s += sq[(x>>16)&0xff];
 209             s += sq[(x>>24)&0xff];
 210             s += sq[(x>>32)&0xff];
 211             s += sq[(x>>40)&0xff];
 212             s += sq[(x>>48)&0xff];
 213             s += sq[(x>>56)&0xff];
 214 #else
 215             register uint32_t x=*(uint32_t*)pix;
 216             s += sq[x&0xff];
 217             s += sq[(x>>8)&0xff];
 218             s += sq[(x>>16)&0xff];
 219             s += sq[(x>>24)&0xff];
 220             x=*(uint32_t*)(pix+4);
 221             s += sq[x&0xff];
 222             s += sq[(x>>8)&0xff];
 223             s += sq[(x>>16)&0xff];
 224             s += sq[(x>>24)&0xff];
 225 #endif
 226 #endif
 227             pix += 8;
 228         }
 229         pix += line_size - 16;
 230     }
 231     return s;
 232 }
 233
 234 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 235     int i;
 236
 237     for(i=0; i+8<=w; i+=8){
 238         dst[i+0]= av_bswap32(src[i+0]);
 239         dst[i+1]= av_bswap32(src[i+1]);
 240         dst[i+2]= av_bswap32(src[i+2]);
 241         dst[i+3]= av_bswap32(src[i+3]);
 242         dst[i+4]= av_bswap32(src[i+4]);
 243         dst[i+5]= av_bswap32(src[i+5]);
 244         dst[i+6]= av_bswap32(src[i+6]);
 245         dst[i+7]= av_bswap32(src[i+7]);
 246     }
 247     for(;i<w; i++){
 248         dst[i+0]= av_bswap32(src[i+0]);
 249     }
 250 }
 251
 252 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 253 {
 254     while (len--)
 255         *dst++ = av_bswap16(*src++);
 256 }
 257
 258 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 259 {
 260     int s, i;
 261     uint32_t *sq = ff_squareTbl + 256;
 262
 263     s = 0;
 264     for (i = 0; i < h; i++) {
 265         s += sq[pix1[0] - pix2[0]];
 266         s += sq[pix1[1] - pix2[1]];
 267         s += sq[pix1[2] - pix2[2]];
 268         s += sq[pix1[3] - pix2[3]];
 269         pix1 += line_size;
 270         pix2 += line_size;
 271     }
 272     return s;
 273 }
 274
 275 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 276 {
 277     int s, i;
 278     uint32_t *sq = ff_squareTbl + 256;
 279
 280     s = 0;
 281     for (i = 0; i < h; i++) {
 282         s += sq[pix1[0] - pix2[0]];
 283         s += sq[pix1[1] - pix2[1]];
 284         s += sq[pix1[2] - pix2[2]];
 285         s += sq[pix1[3] - pix2[3]];
 286         s += sq[pix1[4] - pix2[4]];
 287         s += sq[pix1[5] - pix2[5]];
 288         s += sq[pix1[6] - pix2[6]];
 289         s += sq[pix1[7] - pix2[7]];
 290         pix1 += line_size;
 291         pix2 += line_size;
 292     }
 293     return s;
 294 }
 295
 296 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 297 {
 298     int s, i;
 299     uint32_t *sq = ff_squareTbl + 256;
 300
 301     s = 0;
 302     for (i = 0; i < h; i++) {
 303         s += sq[pix1[ 0] - pix2[ 0]];
 304         s += sq[pix1[ 1] - pix2[ 1]];
 305         s += sq[pix1[ 2] - pix2[ 2]];
 306         s += sq[pix1[ 3] - pix2[ 3]];
 307         s += sq[pix1[ 4] - pix2[ 4]];
 308         s += sq[pix1[ 5] - pix2[ 5]];
 309         s += sq[pix1[ 6] - pix2[ 6]];
 310         s += sq[pix1[ 7] - pix2[ 7]];
 311         s += sq[pix1[ 8] - pix2[ 8]];
 312         s += sq[pix1[ 9] - pix2[ 9]];
 313         s += sq[pix1[10] - pix2[10]];
 314         s += sq[pix1[11] - pix2[11]];
 315         s += sq[pix1[12] - pix2[12]];
 316         s += sq[pix1[13] - pix2[13]];
 317         s += sq[pix1[14] - pix2[14]];
 318         s += sq[pix1[15] - pix2[15]];
 319
 320         pix1 += line_size;
 321         pix2 += line_size;
 322     }
 323     return s;
 324 }
 325
 326 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
 327                           const uint8_t *s2, int stride){
 328     int i;
 329
 330     /* read the pixels */
 331     for(i=0;i<8;i++) {
 332         block[0] = s1[0] - s2[0];
 333         block[1] = s1[1] - s2[1];
 334         block[2] = s1[2] - s2[2];
 335         block[3] = s1[3] - s2[3];
 336         block[4] = s1[4] - s2[4];
 337         block[5] = s1[5] - s2[5];
 338         block[6] = s1[6] - s2[6];
 339         block[7] = s1[7] - s2[7];
 340         s1 += stride;
 341         s2 += stride;
 342         block += 8;
 343     }
 344 }
 345
 346
 347 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 348                                  int line_size)
 349 {
 350     int i;
 351
 352     /* read the pixels */
 353     for(i=0;i<8;i++) {
 354         pixels[0] = av_clip_uint8(block[0]);
 355         pixels[1] = av_clip_uint8(block[1]);
 356         pixels[2] = av_clip_uint8(block[2]);
 357         pixels[3] = av_clip_uint8(block[3]);
 358         pixels[4] = av_clip_uint8(block[4]);
 359         pixels[5] = av_clip_uint8(block[5]);
 360         pixels[6] = av_clip_uint8(block[6]);
 361         pixels[7] = av_clip_uint8(block[7]);
 362
 363         pixels += line_size;
 364         block += 8;
 365     }
 366 }
 367
 368 static void put_signed_pixels_clamped_c(const int16_t *block,
 369                                         uint8_t *restrict pixels,
 370                                         int line_size)
 371 {
 372     int i, j;
 373
 374     for (i = 0; i < 8; i++) {
 375         for (j = 0; j < 8; j++) {
 376             if (*block < -128)
 377                 *pixels = 0;
 378             else if (*block > 127)
 379                 *pixels = 255;
 380             else
 381                 *pixels = (uint8_t)(*block + 128);
 382             block++;
 383             pixels++;
 384         }
 385         pixels += (line_size - 8);
 386     }
 387 }
 388
 389 static void add_pixels8_c(uint8_t *restrict pixels,
 390                           int16_t *block,
 391                           int line_size)
 392 {
 393     int i;
 394
 395     for(i=0;i<8;i++) {
 396         pixels[0] += block[0];
 397         pixels[1] += block[1];
 398         pixels[2] += block[2];
 399         pixels[3] += block[3];
 400         pixels[4] += block[4];
 401         pixels[5] += block[5];
 402         pixels[6] += block[6];
 403         pixels[7] += block[7];
 404         pixels += line_size;
 405         block += 8;
 406     }
 407 }
 408
 409 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
 410                                  int line_size)
 411 {
 412     int i;
 413
 414     /* read the pixels */
 415     for(i=0;i<8;i++) {
 416         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 417         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 418         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 419         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 420         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 421         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 422         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 423         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 424         pixels += line_size;
 425         block += 8;
 426     }
 427 }
 428
 429 static int sum_abs_dctelem_c(int16_t *block)
 430 {
 431     int sum=0, i;
 432     for(i=0; i<64; i++)
 433         sum+= FFABS(block[i]);
 434     return sum;
 435 }
 436
 437 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 438 {
 439     int i;
 440
 441     for (i = 0; i < h; i++) {
 442         memset(block, value, 16);
 443         block += line_size;
 444     }
 445 }
 446
 447 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 448 {
 449     int i;
 450
 451     for (i = 0; i < h; i++) {
 452         memset(block, value, 8);
 453         block += line_size;
 454     }
 455 }
 456
 457 #define avg2(a,b) ((a+b+1)>>1)
 458 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 459
 460 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 461 {
 462     const int A=(16-x16)*(16-y16);
 463     const int B=(   x16)*(16-y16);
 464     const int C=(16-x16)*(   y16);
 465     const int D=(   x16)*(   y16);
 466     int i;
 467
 468     for(i=0; i<h; i++)
 469     {
 470         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 471         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 472         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 473         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 474         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 475         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 476         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 477         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 478         dst+= stride;
 479         src+= stride;
 480     }
 481 }
 482
 483 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 484                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 485 {
 486     int y, vx, vy;
 487     const int s= 1<<shift;
 488
 489     width--;
 490     height--;
 491
 492     for(y=0; y<h; y++){
 493         int x;
 494
 495         vx= ox;
 496         vy= oy;
 497         for(x=0; x<8; x++){ //XXX FIXME optimize
 498             int src_x, src_y, frac_x, frac_y, index;
 499
 500             src_x= vx>>16;
 501             src_y= vy>>16;
 502             frac_x= src_x&(s-1);
 503             frac_y= src_y&(s-1);
 504             src_x>>=shift;
 505             src_y>>=shift;
 506
 507             if((unsigned)src_x < width){
 508                 if((unsigned)src_y < height){
 509                     index= src_x + src_y*stride;
 510                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 511                                            + src[index       +1]*   frac_x )*(s-frac_y)
 512                                         + (  src[index+stride  ]*(s-frac_x)
 513                                            + src[index+stride+1]*   frac_x )*   frac_y
 514                                         + r)>>(shift*2);
 515                 }else{
 516                     index= src_x + av_clip(src_y, 0, height)*stride;
 517                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 518                                           + src[index       +1]*   frac_x )*s
 519                                         + r)>>(shift*2);
 520                 }
 521             }else{
 522                 if((unsigned)src_y < height){
 523                     index= av_clip(src_x, 0, width) + src_y*stride;
 524                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 525                                            + src[index+stride  ]*   frac_y )*s
 526                                         + r)>>(shift*2);
 527                 }else{
 528                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 529                     dst[y*stride + x]=    src[index         ];
 530                 }
 531             }
 532
 533             vx+= dxx;
 534             vy+= dyx;
 535         }
 536         ox += dxy;
 537         oy += dyy;
 538     }
 539 }
 540
 541 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 542     switch(width){
 543     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 544     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 545     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 546     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 547     }
 548 }
 549
 550 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 551     int i,j;
 552     for (i=0; i < height; i++) {
 553       for (j=0; j < width; j++) {
 554         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 555       }
 556       src += stride;
 557       dst += stride;
 558     }
 559 }
 560
 561 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 562     int i,j;
 563     for (i=0; i < height; i++) {
 564       for (j=0; j < width; j++) {
 565         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 566       }
 567       src += stride;
 568       dst += stride;
 569     }
 570 }
 571
 572 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 573     int i,j;
 574     for (i=0; i < height; i++) {
 575       for (j=0; j < width; j++) {
 576         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 577       }
 578       src += stride;
 579       dst += stride;
 580     }
 581 }
 582
 583 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 584     int i,j;
 585     for (i=0; i < height; i++) {
 586       for (j=0; j < width; j++) {
 587         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 588       }
 589       src += stride;
 590       dst += stride;
 591     }
 592 }
 593
 594 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 595     int i,j;
 596     for (i=0; i < height; i++) {
 597       for (j=0; j < width; j++) {
 598         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 599       }
 600       src += stride;
 601       dst += stride;
 602     }
 603 }
 604
 605 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 606     int i,j;
 607     for (i=0; i < height; i++) {
 608       for (j=0; j < width; j++) {
 609         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 610       }
 611       src += stride;
 612       dst += stride;
 613     }
 614 }
 615
 616 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 617     int i,j;
 618     for (i=0; i < height; i++) {
 619       for (j=0; j < width; j++) {
 620         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 621       }
 622       src += stride;
 623       dst += stride;
 624     }
 625 }
 626
 627 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 628     int i,j;
 629     for (i=0; i < height; i++) {
 630       for (j=0; j < width; j++) {
 631         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 632       }
 633       src += stride;
 634       dst += stride;
 635     }
 636 }
 637
 638 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 639     switch(width){
 640     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 641     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 642     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 643     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 644     }
 645 }
 646
 647 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 648     int i,j;
 649     for (i=0; i < height; i++) {
 650       for (j=0; j < width; j++) {
 651         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 652       }
 653       src += stride;
 654       dst += stride;
 655     }
 656 }
 657
 658 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 659     int i,j;
 660     for (i=0; i < height; i++) {
 661       for (j=0; j < width; j++) {
 662         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 663       }
 664       src += stride;
 665       dst += stride;
 666     }
 667 }
 668
 669 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 670     int i,j;
 671     for (i=0; i < height; i++) {
 672       for (j=0; j < width; j++) {
 673         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 674       }
 675       src += stride;
 676       dst += stride;
 677     }
 678 }
 679
 680 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 681     int i,j;
 682     for (i=0; i < height; i++) {
 683       for (j=0; j < width; j++) {
 684         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 685       }
 686       src += stride;
 687       dst += stride;
 688     }
 689 }
 690
 691 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 692     int i,j;
 693     for (i=0; i < height; i++) {
 694       for (j=0; j < width; j++) {
 695         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 696       }
 697       src += stride;
 698       dst += stride;
 699     }
 700 }
 701
 702 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 703     int i,j;
 704     for (i=0; i < height; i++) {
 705       for (j=0; j < width; j++) {
 706         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 707       }
 708       src += stride;
 709       dst += stride;
 710     }
 711 }
 712
 713 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 714     int i,j;
 715     for (i=0; i < height; i++) {
 716       for (j=0; j < width; j++) {
 717         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 718       }
 719       src += stride;
 720       dst += stride;
 721     }
 722 }
 723
 724 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 725     int i,j;
 726     for (i=0; i < height; i++) {
 727       for (j=0; j < width; j++) {
 728         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 729       }
 730       src += stride;
 731       dst += stride;
 732     }
 733 }
 734
 735 #define QPEL_MC(r, OPNAME, RND, OP) \
 736 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 737     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 738     int i;\
 739     for(i=0; i<h; i++)\
 740     {\
 741         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 742         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 743         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 744         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 745         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 746         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 747         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 748         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 749         dst+=dstStride;\
 750         src+=srcStride;\
 751     }\
 752 }\
 753 \
 754 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 755     const int w=8;\
 756     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 757     int i;\
 758     for(i=0; i<w; i++)\
 759     {\
 760         const int src0= src[0*srcStride];\
 761         const int src1= src[1*srcStride];\
 762         const int src2= src[2*srcStride];\
 763         const int src3= src[3*srcStride];\
 764         const int src4= src[4*srcStride];\
 765         const int src5= src[5*srcStride];\
 766         const int src6= src[6*srcStride];\
 767         const int src7= src[7*srcStride];\
 768         const int src8= src[8*srcStride];\
 769         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 770         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 771         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 772         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 773         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 774         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 775         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 776         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 777         dst++;\
 778         src++;\
 779     }\
 780 }\
 781 \
 782 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 783     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 784     int i;\
 785     \
 786     for(i=0; i<h; i++)\
 787     {\
 788         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 789         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 790         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 791         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 792         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 793         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 794         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 795         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 796         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 797         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 798         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 799         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 800         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 801         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 802         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 803         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 804         dst+=dstStride;\
 805         src+=srcStride;\
 806     }\
 807 }\
 808 \
 809 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 810     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 811     int i;\
 812     const int w=16;\
 813     for(i=0; i<w; i++)\
 814     {\
 815         const int src0= src[0*srcStride];\
 816         const int src1= src[1*srcStride];\
 817         const int src2= src[2*srcStride];\
 818         const int src3= src[3*srcStride];\
 819         const int src4= src[4*srcStride];\
 820         const int src5= src[5*srcStride];\
 821         const int src6= src[6*srcStride];\
 822         const int src7= src[7*srcStride];\
 823         const int src8= src[8*srcStride];\
 824         const int src9= src[9*srcStride];\
 825         const int src10= src[10*srcStride];\
 826         const int src11= src[11*srcStride];\
 827         const int src12= src[12*srcStride];\
 828         const int src13= src[13*srcStride];\
 829         const int src14= src[14*srcStride];\
 830         const int src15= src[15*srcStride];\
 831         const int src16= src[16*srcStride];\
 832         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 833         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 834         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 835         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 836         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 837         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 838         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 839         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 840         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 841         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 842         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 843         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 844         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 845         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 846         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 847         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 848         dst++;\
 849         src++;\
 850     }\
 851 }\
 852 \
 853 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 854 {\
 855     uint8_t half[64];\
 856     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 857     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 858 }\
 859 \
 860 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 861 {\
 862     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 863 }\
 864 \
 865 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 866 {\
 867     uint8_t half[64];\
 868     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 869     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 870 }\
 871 \
 872 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 873 {\
 874     uint8_t full[16*9];\
 875     uint8_t half[64];\
 876     copy_block9(full, src, 16, stride, 9);\
 877     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 878     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 879 }\
 880 \
 881 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 882 {\
 883     uint8_t full[16*9];\
 884     copy_block9(full, src, 16, stride, 9);\
 885     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 886 }\
 887 \
 888 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 889 {\
 890     uint8_t full[16*9];\
 891     uint8_t half[64];\
 892     copy_block9(full, src, 16, stride, 9);\
 893     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 894     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 895 }\
 896 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 897 {\
 898     uint8_t full[16*9];\
 899     uint8_t halfH[72];\
 900     uint8_t halfV[64];\
 901     uint8_t halfHV[64];\
 902     copy_block9(full, src, 16, stride, 9);\
 903     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 904     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 905     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 906     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 907 }\
 908 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 909 {\
 910     uint8_t full[16*9];\
 911     uint8_t halfH[72];\
 912     uint8_t halfHV[64];\
 913     copy_block9(full, src, 16, stride, 9);\
 914     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 915     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 916     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 917     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 918 }\
 919 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 920 {\
 921     uint8_t full[16*9];\
 922     uint8_t halfH[72];\
 923     uint8_t halfV[64];\
 924     uint8_t halfHV[64];\
 925     copy_block9(full, src, 16, stride, 9);\
 926     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 927     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 928     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 929     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 930 }\
 931 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 932 {\
 933     uint8_t full[16*9];\
 934     uint8_t halfH[72];\
 935     uint8_t halfHV[64];\
 936     copy_block9(full, src, 16, stride, 9);\
 937     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 938     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 939     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 940     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 941 }\
 942 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 943 {\
 944     uint8_t full[16*9];\
 945     uint8_t halfH[72];\
 946     uint8_t halfV[64];\
 947     uint8_t halfHV[64];\
 948     copy_block9(full, src, 16, stride, 9);\
 949     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 950     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 951     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 952     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 953 }\
 954 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 955 {\
 956     uint8_t full[16*9];\
 957     uint8_t halfH[72];\
 958     uint8_t halfHV[64];\
 959     copy_block9(full, src, 16, stride, 9);\
 960     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 961     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 962     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 963     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 964 }\
 965 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 966 {\
 967     uint8_t full[16*9];\
 968     uint8_t halfH[72];\
 969     uint8_t halfV[64];\
 970     uint8_t halfHV[64];\
 971     copy_block9(full, src, 16, stride, 9);\
 972     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
 973     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 974     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 975     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 976 }\
 977 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 978 {\
 979     uint8_t full[16*9];\
 980     uint8_t halfH[72];\
 981     uint8_t halfHV[64];\
 982     copy_block9(full, src, 16, stride, 9);\
 983     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 984     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
 985     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 986     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
 987 }\
 988 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 989 {\
 990     uint8_t halfH[72];\
 991     uint8_t halfHV[64];\
 992     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
 993     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 994     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 995 }\
 996 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 997 {\
 998     uint8_t halfH[72];\
 999     uint8_t halfHV[64];\
1000     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1001     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1002     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1003 }\
1004 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1005 {\
1006     uint8_t full[16*9];\
1007     uint8_t halfH[72];\
1008     uint8_t halfV[64];\
1009     uint8_t halfHV[64];\
1010     copy_block9(full, src, 16, stride, 9);\
1011     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1012     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1013     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1014     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1015 }\
1016 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1017 {\
1018     uint8_t full[16*9];\
1019     uint8_t halfH[72];\
1020     copy_block9(full, src, 16, stride, 9);\
1021     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1022     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1023     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1024 }\
1025 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1026 {\
1027     uint8_t full[16*9];\
1028     uint8_t halfH[72];\
1029     uint8_t halfV[64];\
1030     uint8_t halfHV[64];\
1031     copy_block9(full, src, 16, stride, 9);\
1032     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1033     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1034     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1035     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1036 }\
1037 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1038 {\
1039     uint8_t full[16*9];\
1040     uint8_t halfH[72];\
1041     copy_block9(full, src, 16, stride, 9);\
1042     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1043     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1044     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1045 }\
1046 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1047 {\
1048     uint8_t halfH[72];\
1049     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1050     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1051 }\
1052 \
1053 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1054 {\
1055     uint8_t half[256];\
1056     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1057     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1058 }\
1059 \
1060 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1061 {\
1062     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1063 }\
1064 \
1065 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1066 {\
1067     uint8_t half[256];\
1068     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1069     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1070 }\
1071 \
1072 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1073 {\
1074     uint8_t full[24*17];\
1075     uint8_t half[256];\
1076     copy_block17(full, src, 24, stride, 17);\
1077     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1078     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1079 }\
1080 \
1081 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1082 {\
1083     uint8_t full[24*17];\
1084     copy_block17(full, src, 24, stride, 17);\
1085     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1086 }\
1087 \
1088 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1089 {\
1090     uint8_t full[24*17];\
1091     uint8_t half[256];\
1092     copy_block17(full, src, 24, stride, 17);\
1093     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1094     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1095 }\
1096 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1097 {\
1098     uint8_t full[24*17];\
1099     uint8_t halfH[272];\
1100     uint8_t halfV[256];\
1101     uint8_t halfHV[256];\
1102     copy_block17(full, src, 24, stride, 17);\
1103     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1104     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1105     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1106     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1107 }\
1108 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1109 {\
1110     uint8_t full[24*17];\
1111     uint8_t halfH[272];\
1112     uint8_t halfHV[256];\
1113     copy_block17(full, src, 24, stride, 17);\
1114     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1115     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1116     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1117     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1118 }\
1119 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1120 {\
1121     uint8_t full[24*17];\
1122     uint8_t halfH[272];\
1123     uint8_t halfV[256];\
1124     uint8_t halfHV[256];\
1125     copy_block17(full, src, 24, stride, 17);\
1126     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1127     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1128     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1129     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1130 }\
1131 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1132 {\
1133     uint8_t full[24*17];\
1134     uint8_t halfH[272];\
1135     uint8_t halfHV[256];\
1136     copy_block17(full, src, 24, stride, 17);\
1137     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1138     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1139     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1140     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1141 }\
1142 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1143 {\
1144     uint8_t full[24*17];\
1145     uint8_t halfH[272];\
1146     uint8_t halfV[256];\
1147     uint8_t halfHV[256];\
1148     copy_block17(full, src, 24, stride, 17);\
1149     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1150     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1152     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1153 }\
1154 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1155 {\
1156     uint8_t full[24*17];\
1157     uint8_t halfH[272];\
1158     uint8_t halfHV[256];\
1159     copy_block17(full, src, 24, stride, 17);\
1160     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1161     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1162     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1163     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1164 }\
1165 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1166 {\
1167     uint8_t full[24*17];\
1168     uint8_t halfH[272];\
1169     uint8_t halfV[256];\
1170     uint8_t halfHV[256];\
1171     copy_block17(full, src, 24, stride, 17);\
1172     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1173     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1174     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1176 }\
1177 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1178 {\
1179     uint8_t full[24*17];\
1180     uint8_t halfH[272];\
1181     uint8_t halfHV[256];\
1182     copy_block17(full, src, 24, stride, 17);\
1183     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1185     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1187 }\
1188 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1189 {\
1190     uint8_t halfH[272];\
1191     uint8_t halfHV[256];\
1192     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1193     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1194     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1195 }\
1196 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1197 {\
1198     uint8_t halfH[272];\
1199     uint8_t halfHV[256];\
1200     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1201     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1202     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1203 }\
1204 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1205 {\
1206     uint8_t full[24*17];\
1207     uint8_t halfH[272];\
1208     uint8_t halfV[256];\
1209     uint8_t halfHV[256];\
1210     copy_block17(full, src, 24, stride, 17);\
1211     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1212     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1213     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1214     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1215 }\
1216 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1217 {\
1218     uint8_t full[24*17];\
1219     uint8_t halfH[272];\
1220     copy_block17(full, src, 24, stride, 17);\
1221     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1222     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1223     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1224 }\
1225 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1226 {\
1227     uint8_t full[24*17];\
1228     uint8_t halfH[272];\
1229     uint8_t halfV[256];\
1230     uint8_t halfHV[256];\
1231     copy_block17(full, src, 24, stride, 17);\
1232     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1233     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1234     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1235     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1236 }\
1237 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1238 {\
1239     uint8_t full[24*17];\
1240     uint8_t halfH[272];\
1241     copy_block17(full, src, 24, stride, 17);\
1242     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1243     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1244     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1245 }\
1246 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1247 {\
1248     uint8_t halfH[272];\
1249     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1250     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1251 }
1252
1253 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1254 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1255 #define op_put(a, b) a = cm[((b) + 16)>>5]
1256 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1257
1258 QPEL_MC(0, put_       , _       , op_put)
1259 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1260 QPEL_MC(0, avg_       , _       , op_avg)
1261
1262 #undef op_avg
1263 #undef op_put
1264 #undef op_put_no_rnd
1265
/**
 * Fixed-size 8x8 wrapper: forwards dst, src and stride to put_pixels8_8_c()
 * with the height argument set to 8.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Fixed-size 8x8 wrapper: forwards dst, src and stride to avg_pixels8_8_c()
 * with the height argument set to 8.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Fixed-size 16x16 wrapper: forwards dst, src and stride to
 * put_pixels16_8_c() with the height argument set to 16.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/**
 * Fixed-size 16x16 wrapper: forwards dst, src and stride to
 * avg_pixels16_8_c() with the height argument set to 16.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1282
/* The mc00 entries of the qpel function families are plain aliases for the
 * fixed-size block wrappers. Both put_no_rnd_ aliases map to the regular
 * ff_put_ wrappers; there is no separate no_rnd variant for mc00. */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1289
1290 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1291     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1292     int i;
1293
1294     for(i=0; i<h; i++){
1295         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1296         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1297         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1298         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1299         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1300         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1301         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1302         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1303         dst+=dstStride;
1304         src+=srcStride;
1305     }
1306 }
1307
1308 #if CONFIG_RV40_DECODER
/**
 * RV40 16x16 mc33 "put" entry: forwards to put_pixels16_xy2_8_c() with a
 * height of 16.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/**
 * RV40 16x16 mc33 "avg" entry: forwards to avg_pixels16_xy2_8_c() with a
 * height of 16.
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/**
 * RV40 8x8 mc33 "put" entry: forwards to put_pixels8_xy2_8_c() with a
 * height of 8.
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/**
 * RV40 8x8 mc33 "avg" entry: forwards to avg_pixels8_xy2_8_c() with a
 * height of 8.
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
1325 #endif /* CONFIG_RV40_DECODER */
1326
1327 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1328     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1329     int i;
1330
1331     for(i=0; i<w; i++){
1332         const int src_1= src[ -srcStride];
1333         const int src0 = src[0          ];
1334         const int src1 = src[  srcStride];
1335         const int src2 = src[2*srcStride];
1336         const int src3 = src[3*srcStride];
1337         const int src4 = src[4*srcStride];
1338         const int src5 = src[5*srcStride];
1339         const int src6 = src[6*srcStride];
1340         const int src7 = src[7*srcStride];
1341         const int src8 = src[8*srcStride];
1342         const int src9 = src[9*srcStride];
1343         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1344         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1345         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1346         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1347         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1348         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1349         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1350         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1351         src++;
1352         dst++;
1353     }
1354 }
1355
/**
 * 8x8 mspel mc10 case: horizontally filters src into a packed 8x8 scratch
 * block, then hands src and that block to put_pixels8_l2_8() to produce dst.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];    /* packed, row stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1362
/**
 * 8x8 mspel mc20 case: horizontal filter only. Runs wmv2_mspel8_h_lowpass()
 * for 8 rows straight from src into dst, using stride for both buffers.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1367
/**
 * 8x8 mspel mc30 case: horizontally filters src into a packed 8x8 scratch
 * block, then hands src+1 (the source shifted one pixel to the right) and
 * that block to put_pixels8_l2_8() to produce dst.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];    /* packed, row stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1374
/**
 * 8x8 mspel mc02 case: vertical filter only. Runs wmv2_mspel8_v_lowpass()
 * over 8 columns straight from src into dst, using stride for both buffers.
 * The filter reads one row above src and two rows below the 8 output rows.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1379
/**
 * 8x8 mspel mc12 case.
 *
 * Builds two packed 8x8 intermediates and hands them to put_pixels8_l2_8():
 *  - the vertically filtered source, and
 *  - the source filtered horizontally and then vertically.
 * The horizontal pass covers 11 rows starting one row above src so that the
 * following vertical pass has the extra rows it reads.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hbuf[8 * 11];   /* horizontal pass, rows -1..9 of the block */
    uint8_t vbuf[8 * 8];    /* vertical pass on the raw source */
    uint8_t hvbuf[8 * 8];   /* vertical pass on hbuf */

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src, 8, stride, 8);
    /* hbuf + 8 is block row 0; the filter's row -1 read lands on hbuf[0..7] */
    wmv2_mspel8_v_lowpass(hvbuf, hbuf + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc32 case.
 *
 * Same structure as the mc12 case, except that the purely vertical pass
 * starts at src+1 (one pixel to the right). The two packed 8x8
 * intermediates are handed to put_pixels8_l2_8() to produce dst.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hbuf[8 * 11];   /* horizontal pass, rows -1..9 of the block */
    uint8_t vbuf[8 * 8];    /* vertical pass on the source shifted by one pixel */
    uint8_t hvbuf[8 * 8];   /* vertical pass on hbuf */

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src + 1, 8, stride, 8);
    /* hbuf + 8 is block row 0; the filter's row -1 read lands on hbuf[0..7] */
    wmv2_mspel8_v_lowpass(hvbuf, hbuf + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc22 case: horizontal filter followed by vertical filter.
 *
 * The horizontal pass covers 11 rows starting one row above src and is
 * stored packed (row stride 8). The vertical pass then writes directly to
 * dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hbuf[8 * 11];                   /* rows -1..9 of the block */
    uint8_t *const block_row0 = hbuf + 8;   /* skip the extra top row */

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, block_row0, stride, 8, 8);
}
1406
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v          unused
 * @param pix1       first block
 * @param pix2       second block
 * @param line_size  distance between consecutive rows, shared by both blocks
 * @param h          number of rows to compare
 * @return the sum of |pix1 - pix2| over all 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1434
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * horizontally interpolated reference.
 *
 * Each pix1 pixel is compared against avg2() of the pix2 pixel in the same
 * column and its right-hand neighbour, so pix2[0]..pix2[16] are read per row.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, shared by both blocks
 * @param h          number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1462
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * vertically interpolated reference.
 *
 * Each pix1 pixel is compared against avg2() of the pix2 pixel in the same
 * position and the one directly below it, so h+1 rows of pix2 are read.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, shared by both blocks
 * @param h          number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;  /* reference row one line down */

        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1492
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference interpolated in both directions.
 *
 * Each pix1 pixel is compared against avg4() of the 2x2 pix2 neighbourhood
 * whose top-left corner is at the same position, so 17 columns and h+1 rows
 * of pix2 are read.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, shared by both blocks
 * @param h          number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;  /* reference row one line down */

        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1522
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v          unused
 * @param pix1       first block
 * @param pix2       second block
 * @param line_size  distance between consecutive rows, shared by both blocks
 * @param h          number of rows to compare
 * @return the sum of |pix1 - pix2| over all 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1542
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * horizontally interpolated reference.
 *
 * Each pix1 pixel is compared against avg2() of the pix2 pixel in the same
 * column and its right-hand neighbour, so pix2[0]..pix2[8] are read per row.
 *
 * @param v          unused
 * @param pix1       block to compare
 * @param pix2       reference block
 * @param line_size  distance between consecutive rows, shared by both blocks
 * @param h          number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1562
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * pixel with the pixel one row below it, over an 8-wide block of h rows.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1584
/**
 * Sum of absolute differences between pix1 and the avg4() of each 2x2
 * neighbourhood of pix2 (pixel, right, below, below-right), over an
 * 8-wide block of h rows. Reads pix2[0..8] on h+1 rows.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1606
1607 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1608     MpegEncContext *c = v;
1609     int score1=0;
1610     int score2=0;
1611     int x,y;
1612
1613     for(y=0; y<h; y++){
1614         for(x=0; x<16; x++){
1615             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1616         }
1617         if(y+1<h){
1618             for(x=0; x<15; x++){
1619                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1620                              - s1[x+1] + s1[x+1+stride])
1621                         -FFABS(  s2[x  ] - s2[x  +stride]
1622                              - s2[x+1] + s2[x+1+stride]);
1623             }
1624         }
1625         s1+= stride;
1626         s2+= stride;
1627     }
1628
1629     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1630     else  return score1 + FFABS(score2)*8;
1631 }
1632
1633 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1634     MpegEncContext *c = v;
1635     int score1=0;
1636     int score2=0;
1637     int x,y;
1638
1639     for(y=0; y<h; y++){
1640         for(x=0; x<8; x++){
1641             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1642         }
1643         if(y+1<h){
1644             for(x=0; x<7; x++){
1645                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1646                              - s1[x+1] + s1[x+1+stride])
1647                         -FFABS(  s2[x  ] - s2[x  +stride]
1648                              - s2[x+1] + s2[x+1+stride]);
1649             }
1650         }
1651         s1+= stride;
1652         s2+= stride;
1653     }
1654
1655     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1656     else  return score1 + FFABS(score2)*8;
1657 }
1658
1659 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1660     int i;
1661     unsigned int sum=0;
1662
1663     for(i=0; i<8*8; i++){
1664         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1665         int w= weight[i];
1666         b>>= RECON_SHIFT;
1667         assert(-512<b && b<512);
1668
1669         sum += (w*b)*(w*b)>>4;
1670     }
1671     return sum>>2;
1672 }
1673
1674 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1675     int i;
1676
1677     for(i=0; i<8*8; i++){
1678         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1679     }
1680 }
1681
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1685
1686 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1687     int i;
1688
1689     memset(cmp, 0, sizeof(void*)*6);
1690
1691     for(i=0; i<6; i++){
1692         switch(type&0xFF){
1693         case FF_CMP_SAD:
1694             cmp[i]= c->sad[i];
1695             break;
1696         case FF_CMP_SATD:
1697             cmp[i]= c->hadamard8_diff[i];
1698             break;
1699         case FF_CMP_SSE:
1700             cmp[i]= c->sse[i];
1701             break;
1702         case FF_CMP_DCT:
1703             cmp[i]= c->dct_sad[i];
1704             break;
1705         case FF_CMP_DCT264:
1706             cmp[i]= c->dct264_sad[i];
1707             break;
1708         case FF_CMP_DCTMAX:
1709             cmp[i]= c->dct_max[i];
1710             break;
1711         case FF_CMP_PSNR:
1712             cmp[i]= c->quant_psnr[i];
1713             break;
1714         case FF_CMP_BIT:
1715             cmp[i]= c->bit[i];
1716             break;
1717         case FF_CMP_RD:
1718             cmp[i]= c->rd[i];
1719             break;
1720         case FF_CMP_VSAD:
1721             cmp[i]= c->vsad[i];
1722             break;
1723         case FF_CMP_VSSE:
1724             cmp[i]= c->vsse[i];
1725             break;
1726         case FF_CMP_ZERO:
1727             cmp[i]= zero_cmp;
1728             break;
1729         case FF_CMP_NSSE:
1730             cmp[i]= c->nsse[i];
1731             break;
1732         default:
1733             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1734         }
1735     }
1736 }
1737
1738 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1739     long i;
1740     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1741         long a = *(long*)(src+i);
1742         long b = *(long*)(dst+i);
1743         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1744     }
1745     for(; i<w; i++)
1746         dst[i+0] += src[i+0];
1747 }
1748
1749 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1750     long i;
1751 #if !HAVE_FAST_UNALIGNED
1752     if((long)src2 & (sizeof(long)-1)){
1753         for(i=0; i+7<w; i+=8){
1754             dst[i+0] = src1[i+0]-src2[i+0];
1755             dst[i+1] = src1[i+1]-src2[i+1];
1756             dst[i+2] = src1[i+2]-src2[i+2];
1757             dst[i+3] = src1[i+3]-src2[i+3];
1758             dst[i+4] = src1[i+4]-src2[i+4];
1759             dst[i+5] = src1[i+5]-src2[i+5];
1760             dst[i+6] = src1[i+6]-src2[i+6];
1761             dst[i+7] = src1[i+7]-src2[i+7];
1762         }
1763     }else
1764 #endif
1765     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1766         long a = *(long*)(src1+i);
1767         long b = *(long*)(src2+i);
1768         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1769     }
1770     for(; i<w; i++)
1771         dst[i+0] = src1[i+0]-src2[i+0];
1772 }
1773
/**
 * Reconstruct a row from median-prediction residuals.
 *
 * For each position the predictor is mid_pred() of the running left
 * value, src1[i], and (left + src1[i] - left_top) & 0xFF; diff[i] is
 * added to it and the 8-bit result is stored in dst[i].
 *
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial value paired with left; out: last src1 value
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t prev_top = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const uint8_t top = src1[pos];

        cur      = mid_pred(cur, top, (cur + top - prev_top)&0xFF) + diff[pos];
        prev_top = top;
        dst[pos] = cur;
    }

    *left     = cur;
    *left_top = prev_top;
}
1790
/**
 * Compute median-prediction residuals for a row.
 *
 * For each position the predictor is mid_pred() of the running left
 * value, src1[i], and (left + src1[i] - left_top) & 0xFF; dst[i] receives
 * src2[i] minus that predictor (as an 8-bit value).
 *
 * @param left     in: initial left value; out: last src2 value
 * @param left_top in: initial value paired with left; out: last src1 value
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t prev_top = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const uint8_t top = src1[pos];
        const int pred    = mid_pred(cur, top, (cur + top - prev_top)&0xFF);

        prev_top = top;
        cur      = src2[pos];
        dst[pos] = cur - pred;
    }

    *left     = cur;
    *left_top = prev_top;
}
1808
/**
 * Running-sum (left prediction) reconstruction of a row.
 *
 * Each src[i] is added to the accumulator and the low 8 bits of the
 * accumulator are stored in dst[i].
 *
 * @param acc initial accumulator value
 * @return the final accumulator (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1827
/* Byte offsets of the four channels inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum (left prediction) reconstruction of a row of 4-byte pixels,
 * with one independent accumulator per channel.
 *
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: final accumulator (same for
 *              green, blue and alpha)
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* read the whole pixel before writing any of it */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1868
/*
 * Butterfly helpers for the Hadamard transforms below.
 *
 * BUTTERFLY2 and BUTTERFLY1 are wrapped in do { } while (0) so that each
 * expands to exactly one statement and stays correct under an unbraced
 * if/else; invoke them with a trailing semicolon.
 */

/* o1 = i1 + i2, then o2 = i1 - i2 (o1 is assigned first). */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/* In-place butterfly: x becomes x + y and y becomes x - y.
 * The temporaries use names unlikely to collide with the arguments. */
#define BUTTERFLY1(x,y) \
do {\
    int bfly_a_, bfly_b_;\
    bfly_a_= (x);\
    bfly_b_= (y);\
    (x)= bfly_a_+bfly_b_;\
    (y)= bfly_a_-bfly_b_;\
} while (0)

/* |x + y| + |x - y|; evaluates both arguments more than once. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1883
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the difference src - dst.
 *
 * The first pass transforms each row of the difference into temp[]; the
 * second pass applies the same butterflies down each column and folds the
 * last stage directly into the absolute sum.
 *
 * @param s      unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int row, col;

    assert(h==8);

    /* horizontal pass */
    for (row = 0; row < 8; row++) {
        int *t             = temp + 8 * row;
        const uint8_t *cur = src + stride * row;
        const uint8_t *ref = dst + stride * row;

        BUTTERFLY2(t[0], t[1], cur[0]-ref[0], cur[1]-ref[1]);
        BUTTERFLY2(t[2], t[3], cur[2]-ref[2], cur[3]-ref[3]);
        BUTTERFLY2(t[4], t[5], cur[4]-ref[4], cur[5]-ref[5]);
        BUTTERFLY2(t[6], t[7], cur[6]-ref[6], cur[7]-ref[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass, last stage merged with the absolute sum */
    for (col = 0; col < 8; col++) {
        int *t = temp + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
             + BUTTERFLYA(t[8*1], t[8*5])
             + BUTTERFLYA(t[8*2], t[8*6])
             + BUTTERFLYA(t[8*3], t[8*7]);
    }
    return sum;
}
1928
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the source block itself, with one term subtracted at the end (the
 * original marks it "-mean").
 *
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int row, col;

    assert(h==8);

    /* horizontal pass */
    for (row = 0; row < 8; row++) {
        int *t             = temp + 8 * row;
        const uint8_t *pix = src + stride * row;

        BUTTERFLY2(t[0], t[1], pix[0], pix[1]);
        BUTTERFLY2(t[2], t[3], pix[2], pix[3]);
        BUTTERFLY2(t[4], t[5], pix[4], pix[5]);
        BUTTERFLY2(t[6], t[7], pix[6], pix[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass, last stage merged with the absolute sum */
    for (col = 0; col < 8; col++) {
        int *t = temp + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
             + BUTTERFLYA(t[8*1], t[8*5])
             + BUTTERFLYA(t[8*2], t[8*6])
             + BUTTERFLYA(t[8*3], t[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
1976
1977 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1978     MpegEncContext * const s= (MpegEncContext *)c;
1979     LOCAL_ALIGNED_16(int16_t, temp, [64]);
1980
1981     assert(h==8);
1982
1983     s->dsp.diff_pixels(temp, src1, src2, stride);
1984     s->dsp.fdct(temp);
1985     return s->dsp.sum_abs_dctelem(temp);
1986 }
1987
#if CONFIG_GPL
/*
 * One 8-point integer transform stage. The caller defines SRC(n) to read
 * input n and DST(n, v) to consume output n. All eight inputs are read
 * before the first DST is issued.
 */
#define DCT8_1D {\
    const int dif07 = SRC(0) - SRC(7);\
    const int dif16 = SRC(1) - SRC(6);\
    const int dif25 = SRC(2) - SRC(5);\
    const int dif34 = SRC(3) - SRC(4);\
    const int sum07 = SRC(0) + SRC(7);\
    const int sum16 = SRC(1) + SRC(6);\
    const int sum25 = SRC(2) + SRC(5);\
    const int sum34 = SRC(3) + SRC(4);\
    const int e0 = sum07 + sum34;\
    const int e1 = sum16 + sum25;\
    const int e2 = sum07 - sum34;\
    const int e3 = sum16 - sum25;\
    const int e4 = dif16 + dif25 + (dif07 + (dif07>>1));\
    const int e5 = dif07 - dif34 - (dif25 + (dif25>>1));\
    const int e6 = dif07 + dif34 - (dif16 + (dif16>>1));\
    const int e7 = dif16 - dif25 + (dif34 + (dif34>>1));\
    DST(0,  e0 + e1     ) ;\
    DST(1,  e4 + (e7>>2)) ;\
    DST(2,  e2 + (e3>>1)) ;\
    DST(3,  e5 + (e6>>2)) ;\
    DST(4,  e0 - e1     ) ;\
    DST(5,  e6 - (e5>>2)) ;\
    DST(6, (e2>>1) - e3 ) ;\
    DST(7, (e4>>2) - e7 ) ;\
}

/**
 * Comparison metric: applies DCT8_1D to the rows and then the columns of
 * the 8x8 pixel difference and returns the sum of the absolute values of
 * the column-pass outputs.
 *
 * @param c      MpegEncContext pointer (its dsp.diff_pixels is used)
 * @param stride distance in bytes between consecutive rows
 * @param h      not examined
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = c;
    int16_t blk[8][8];
    int total = 0;
    int k;

    enc->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* row pass: results are written back into the block */
#define SRC(x) blk[k][x]
#define DST(x,v) blk[k][x]= v
    for (k = 0; k < 8; k++)
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: results are only accumulated */
#define SRC(x) blk[x][k]
#define DST(x,v) total += FFABS(v)
    for (k = 0; k < 8; k++)
        DCT8_1D
#undef SRC
#undef DST
    return total;
}
#endif
2040
2041 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2042     MpegEncContext * const s= (MpegEncContext *)c;
2043     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2044     int sum=0, i;
2045
2046     assert(h==8);
2047
2048     s->dsp.diff_pixels(temp, src1, src2, stride);
2049     s->dsp.fdct(temp);
2050
2051     for(i=0; i<64; i++)
2052         sum= FFMAX(sum, FFABS(temp[i]));
2053
2054     return sum;
2055 }
2056
2057 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2058     MpegEncContext * const s= (MpegEncContext *)c;
2059     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2060     int16_t * const bak = temp+64;
2061     int sum=0, i;
2062
2063     assert(h==8);
2064     s->mb_intra=0;
2065
2066     s->dsp.diff_pixels(temp, src1, src2, stride);
2067
2068     memcpy(bak, temp, 64*sizeof(int16_t));
2069
2070     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2071     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2072     ff_simple_idct_8(temp); //FIXME
2073
2074     for(i=0; i<64; i++)
2075         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2076
2077     return sum;
2078 }
2079
2080 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2081     MpegEncContext * const s= (MpegEncContext *)c;
2082     const uint8_t *scantable= s->intra_scantable.permutated;
2083     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2084     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2085     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2086     int i, last, run, bits, level, distortion, start_i;
2087     const int esc_length= s->ac_esc_length;
2088     uint8_t * length;
2089     uint8_t * last_length;
2090
2091     assert(h==8);
2092
2093     copy_block8(lsrc1, src1, 8, stride, 8);
2094     copy_block8(lsrc2, src2, 8, stride, 8);
2095
2096     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2097
2098     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2099
2100     bits=0;
2101
2102     if (s->mb_intra) {
2103         start_i = 1;
2104         length     = s->intra_ac_vlc_length;
2105         last_length= s->intra_ac_vlc_last_length;
2106         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2107     } else {
2108         start_i = 0;
2109         length     = s->inter_ac_vlc_length;
2110         last_length= s->inter_ac_vlc_last_length;
2111     }
2112
2113     if(last>=start_i){
2114         run=0;
2115         for(i=start_i; i<last; i++){
2116             int j= scantable[i];
2117             level= temp[j];
2118
2119             if(level){
2120                 level+=64;
2121                 if((level&(~127)) == 0){
2122                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2123                 }else
2124                     bits+= esc_length;
2125                 run=0;
2126             }else
2127                 run++;
2128         }
2129         i= scantable[last];
2130
2131         level= temp[i] + 64;
2132
2133         assert(level - 64);
2134
2135         if((level&(~127)) == 0){
2136             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2137         }else
2138             bits+= esc_length;
2139
2140     }
2141
2142     if(last>=0){
2143         if(s->mb_intra)
2144             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2145         else
2146             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2147     }
2148
2149     s->dsp.idct_add(lsrc2, 8, temp);
2150
2151     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2152
2153     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2154 }
2155
2156 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2157     MpegEncContext * const s= (MpegEncContext *)c;
2158     const uint8_t *scantable= s->intra_scantable.permutated;
2159     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2160     int i, last, run, bits, level, start_i;
2161     const int esc_length= s->ac_esc_length;
2162     uint8_t * length;
2163     uint8_t * last_length;
2164
2165     assert(h==8);
2166
2167     s->dsp.diff_pixels(temp, src1, src2, stride);
2168
2169     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2170
2171     bits=0;
2172
2173     if (s->mb_intra) {
2174         start_i = 1;
2175         length     = s->intra_ac_vlc_length;
2176         last_length= s->intra_ac_vlc_last_length;
2177         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2178     } else {
2179         start_i = 0;
2180         length     = s->inter_ac_vlc_length;
2181         last_length= s->inter_ac_vlc_last_length;
2182     }
2183
2184     if(last>=start_i){
2185         run=0;
2186         for(i=start_i; i<last; i++){
2187             int j= scantable[i];
2188             level= temp[j];
2189
2190             if(level){
2191                 level+=64;
2192                 if((level&(~127)) == 0){
2193                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2194                 }else
2195                     bits+= esc_length;
2196                 run=0;
2197             }else
2198                 run++;
2199         }
2200         i= scantable[last];
2201
2202         level= temp[i] + 64;
2203
2204         assert(level - 64);
2205
2206         if((level&(~127)) == 0){
2207             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2208         }else
2209             bits+= esc_length;
2210     }
2211
2212     return bits;
2213 }
2214
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over a block that is
 * <size> pixels wide and h rows tall (h-1 row pairs). The context and the
 * second block pointer are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                  \
    int row, col;                                                   \
                                                                    \
    for (row = 1; row < h; row++, s += stride)                      \
        for (col = 0; col < size; col++)                            \
            total += FFABS(s[col] - s[col + stride]);               \
                                                                    \
    return total;                                                   \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2232
/**
 * Vertical SAD of the difference signal for a 16-pixel-wide block: sums
 * |(s1 - s2) at a row minus (s1 - s2) at the next row| over h-1 row pairs.
 *
 * @param c      unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride)
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);

    return total;
}
2247
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel directly below it, over a block that is
 * <size> pixels wide and h rows tall (h-1 row pairs). The context and the
 * second block pointer are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                  \
    int row, col;                                                   \
                                                                    \
    for (row = 1; row < h; row++, s += stride)                      \
        for (col = 0; col < size; col++)                            \
            total += SQ(s[col] - s[col + stride]);                  \
                                                                    \
    return total;                                                   \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2266
/**
 * Vertical SSE of the difference signal for a 16-pixel-wide block: sums
 * the square of ((s1 - s2) at a row minus (s1 - s2) at the next row) over
 * h-1 row pairs.
 *
 * @param c      unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
    }

    return total;
}
2281
/**
 * Sum of squared differences between an int8_t array and an int16_t
 * array.
 *
 * @param size number of elements compared
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int idx;

    for (idx = 0; idx < size; idx++) {
        const int d = pix1[idx] - pix2[idx];
        total += d * d;
    }
    return total;
}
2290
/*
 * Builds a 16-pixel-wide comparison function out of an 8x8 one by calling
 * it on the left and right 8x8 halves, and on the two halves eight rows
 * further down when h is 16. The calls are kept as separate statements in
 * a fixed order (left, right, lower-left, lower-right).
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int total = name8(s, dst, src, stride, 8);\
    total += name8(s, dst + 8, src + 8, stride, 8);\
    if (h == 16) {\
        uint8_t *dst_low = dst + 8*stride;\
        uint8_t *src_low = src + 8*stride;\
        total += name8(s, dst_low, src_low, stride, 8);\
        total += name8(s, dst_low + 8, src_low + 8, stride, 8);\
    }\
    return total;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2315
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini
 * @param maxi     returned when (a ^ 0x80000000) > maxisign
 * @param maxisign threshold compared against a with its top bit flipped
 * @return mini, maxi or a, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2324
/**
 * Clip a float vector by running clipf_c_one() on the 32-bit patterns of
 * the values; vector_clipf_c() uses this path when min < 0 < max.
 *
 * Elements are processed in groups of 8, so the number of elements
 * touched is len rounded up to a multiple of 8.
 *
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits  = *(uint32_t*)min;
    const uint32_t hi_bits  = *(uint32_t*)max;
    const uint32_t hi_sign  = hi_bits ^ (1U<<31);
    uint32_t *out           = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_sign);
}
/**
 * Clip every element of src to [min, max] and store it in dst.
 *
 * When the bounds have opposite signs (min < 0 and max > 0) the work is
 * handed to vector_clipf_c_opposite_sign(); otherwise av_clipf() is
 * applied element by element. Elements are processed in groups of 8, so
 * the number of elements touched is len rounded up to a multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2360
/**
 * Dot product of two int16_t vectors.
 *
 * @param order number of elements in each vector
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int acc = 0;
    int idx;

    for (idx = 0; order != 0; order--, idx++)
        acc += v1[idx] * v2[idx];

    return acc;
}
2370
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For each element the product v1[i] * v2[i] is accumulated using the
 * value of v1[i] before the update, then v1[i] += mul * v3[i].
 *
 * @param order number of elements in each vector
 * @return the accumulated dot product
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int idx;

    for (idx = 0; order != 0; order--, idx++) {
        acc     += v1[idx] * v2[idx];
        v1[idx] += mul * v3[idx];
    }
    return acc;
}
2380
/**
 * Clip every element of src to [min, max] with av_clip() and store it in
 * dst.
 *
 * The main loop handles 8 elements per iteration; any remaining elements
 * are handled one at a time. The previous do/while form always ran at
 * least once and decremented an unsigned counter by 8, so a len of 0 or
 * one that was not a multiple of 8 wrapped around and ran far past the
 * end of both buffers. Lengths that are non-zero multiples of 8 behave
 * exactly as before.
 *
 * @param len number of elements to process (may be 0)
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    while (len >= 8) {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    }

    /* tail: fewer than 8 elements left */
    while (len > 0) {
        *dst++ = av_clip(*src++, min, max);
        len--;
    }
}
2396
/**
 * Run ff_j_rev_dct() on the block in place, then hand the result to
 * put_pixels_clamped_c() to write it to dest.
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct() on the block in place, then hand the result to
 * add_pixels_clamped_c() to combine it with dest.
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2407
2408 /* init static data */
2409 av_cold void ff_dsputil_static_init(void)
2410 {
2411     int i;
2412
2413     for(i=0;i<512;i++) {
2414         ff_squareTbl[i] = (i - 256) * (i - 256);
2415     }
2416 }
2417
2418 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2419 {
2420 #if CONFIG_ENCODERS
2421     if (avctx->bits_per_raw_sample == 10) {
2422         c->fdct    = ff_jpeg_fdct_islow_10;
2423         c->fdct248 = ff_fdct248_islow_10;
2424     } else {
2425         if(avctx->dct_algo==FF_DCT_FASTINT) {
2426             c->fdct    = ff_fdct_ifast;
2427             c->fdct248 = ff_fdct_ifast248;
2428         }
2429         else if(avctx->dct_algo==FF_DCT_FAAN) {
2430             c->fdct    = ff_faandct;
2431             c->fdct248 = ff_faandct248;
2432         }
2433         else {
2434             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2435             c->fdct248 = ff_fdct248_islow_8;
2436         }
2437     }
2438 #endif //CONFIG_ENCODERS
2439
2440     if (avctx->bits_per_raw_sample == 10) {
2441         c->idct_put              = ff_simple_idct_put_10;
2442         c->idct_add              = ff_simple_idct_add_10;
2443         c->idct                  = ff_simple_idct_10;
2444         c->idct_permutation_type = FF_NO_IDCT_PERM;
2445     } else {
2446         if(avctx->idct_algo==FF_IDCT_INT){
2447             c->idct_put= jref_idct_put;
2448             c->idct_add= jref_idct_add;
2449             c->idct    = ff_j_rev_dct;
2450             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2451         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2452             c->idct_put= ff_faanidct_put;
2453             c->idct_add= ff_faanidct_add;
2454             c->idct    = ff_faanidct;
2455             c->idct_permutation_type= FF_NO_IDCT_PERM;
2456         }else{ //accurate/default
2457             c->idct_put = ff_simple_idct_put_8;
2458             c->idct_add = ff_simple_idct_add_8;
2459             c->idct     = ff_simple_idct_8;
2460             c->idct_permutation_type= FF_NO_IDCT_PERM;
2461         }
2462     }
2463
2464     c->diff_pixels = diff_pixels_c;
2465     c->put_pixels_clamped = put_pixels_clamped_c;
2466     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2467     c->add_pixels_clamped = add_pixels_clamped_c;
2468     c->sum_abs_dctelem = sum_abs_dctelem_c;
2469     c->gmc1 = gmc1_c;
2470     c->gmc = ff_gmc_c;
2471     c->pix_sum = pix_sum_c;
2472     c->pix_norm1 = pix_norm1_c;
2473
2474     c->fill_block_tab[0] = fill_block16_c;
2475     c->fill_block_tab[1] = fill_block8_c;
2476
2477     /* TODO [0] 16  [1] 8 */
2478     c->pix_abs[0][0] = pix_abs16_c;
2479     c->pix_abs[0][1] = pix_abs16_x2_c;
2480     c->pix_abs[0][2] = pix_abs16_y2_c;
2481     c->pix_abs[0][3] = pix_abs16_xy2_c;
2482     c->pix_abs[1][0] = pix_abs8_c;
2483     c->pix_abs[1][1] = pix_abs8_x2_c;
2484     c->pix_abs[1][2] = pix_abs8_y2_c;
2485     c->pix_abs[1][3] = pix_abs8_xy2_c;
2486
2487     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2488     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2489     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2490     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2491     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2492     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2493     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2494     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2495     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2496
2497     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2498     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2499     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2500     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2501     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2502     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2503     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2504     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2505     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2506
2507 #define dspfunc(PFX, IDX, NUM) \
2508     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2509     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2510     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2511     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2512     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2513     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2514     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2515     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2516     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2517     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2518     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2519     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2520     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2521     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2522     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2523     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2524
2525     dspfunc(put_qpel, 0, 16);
2526     dspfunc(put_no_rnd_qpel, 0, 16);
2527
2528     dspfunc(avg_qpel, 0, 16);
2529
2530     dspfunc(put_qpel, 1, 8);
2531     dspfunc(put_no_rnd_qpel, 1, 8);
2532
2533     dspfunc(avg_qpel, 1, 8);
2534
2535 #undef dspfunc
2536
2537     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2538     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2539     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2540     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2541     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2542     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2543     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2544     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2545
2546 #define SET_CMP_FUNC(name) \
2547     c->name[0]= name ## 16_c;\
2548     c->name[1]= name ## 8x8_c;
2549
2550     SET_CMP_FUNC(hadamard8_diff)
2551     c->hadamard8_diff[4]= hadamard8_intra16_c;
2552     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2553     SET_CMP_FUNC(dct_sad)
2554     SET_CMP_FUNC(dct_max)
2555 #if CONFIG_GPL
2556     SET_CMP_FUNC(dct264_sad)
2557 #endif
2558     c->sad[0]= pix_abs16_c;
2559     c->sad[1]= pix_abs8_c;
2560     c->sse[0]= sse16_c;
2561     c->sse[1]= sse8_c;
2562     c->sse[2]= sse4_c;
2563     SET_CMP_FUNC(quant_psnr)
2564     SET_CMP_FUNC(rd)
2565     SET_CMP_FUNC(bit)
2566     c->vsad[0]= vsad16_c;
2567     c->vsad[4]= vsad_intra16_c;
2568     c->vsad[5]= vsad_intra8_c;
2569     c->vsse[0]= vsse16_c;
2570     c->vsse[4]= vsse_intra16_c;
2571     c->vsse[5]= vsse_intra8_c;
2572     c->nsse[0]= nsse16_c;
2573     c->nsse[1]= nsse8_c;
2574
2575     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2576
2577     c->add_bytes= add_bytes_c;
2578     c->diff_bytes= diff_bytes_c;
2579     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2580     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2581     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2582     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2583     c->bswap_buf= bswap_buf;
2584     c->bswap16_buf = bswap16_buf;
2585
2586     c->try_8x8basis= try_8x8basis_c;
2587     c->add_8x8basis= add_8x8basis_c;
2588
2589     c->vector_clipf = vector_clipf_c;
2590     c->scalarproduct_int16 = scalarproduct_int16_c;
2591     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2592     c->vector_clip_int32 = vector_clip_int32_c;
2593
2594     c->shrink[0]= av_image_copy_plane;
2595     c->shrink[1]= ff_shrink22;
2596     c->shrink[2]= ff_shrink44;
2597     c->shrink[3]= ff_shrink88;
2598
2599     c->add_pixels8 = add_pixels8_c;
2600
2601 #undef FUNC
2602 #undef FUNCC
2603 #define FUNC(f, depth) f ## _ ## depth
2604 #define FUNCC(f, depth) f ## _ ## depth ## _c
2605
2606     c->draw_edges                    = FUNCC(draw_edges, 8);
2607     c->clear_block                   = FUNCC(clear_block, 8);
2608     c->clear_blocks                  = FUNCC(clear_blocks, 8);
2609
2610 #define BIT_DEPTH_FUNCS(depth) \
2611     c->get_pixels                    = FUNCC(get_pixels,   depth);
2612
2613     switch (avctx->bits_per_raw_sample) {
2614     case 9:
2615     case 10:
2616         BIT_DEPTH_FUNCS(16);
2617         break;
2618     default:
2619         BIT_DEPTH_FUNCS(8);
2620         break;
2621     }
2622
2623
2624     if (ARCH_ARM)
2625         ff_dsputil_init_arm(c, avctx);
2626     if (ARCH_BFIN)
2627         ff_dsputil_init_bfin(c, avctx);
2628     if (ARCH_PPC)
2629         ff_dsputil_init_ppc(c, avctx);
2630     if (ARCH_X86)
2631         ff_dsputil_init_x86(c, avctx);
2632
2633     ff_init_scantable_permutation(c->idct_permutation,
2634                                   c->idct_permutation_type);
2635 }