git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/attributes.h"
  31 #include "libavutil/imgutils.h"
  32 #include "libavutil/internal.h"
  33 #include "avcodec.h"
  34 #include "copy_block.h"
  35 #include "dct.h"
  36 #include "dsputil.h"
  37 #include "simple_idct.h"
  38 #include "faandct.h"
  39 #include "faanidct.h"
  40 #include "imgconvert.h"
  41 #include "mathops.h"
  42 #include "mpegvideo.h"
  43 #include "config.h"
  44 #include "diracdsp.h"
  45
  46 uint32_t ff_squareTbl[512] = {0, };
  47
  48 #define BIT_DEPTH 16
  49 #include "dsputil_template.c"
  50 #undef BIT_DEPTH
  51
  52 #define BIT_DEPTH 8
  53 #include "dsputil_template.c"
  54
  55 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  56 #define pb_7f (~0UL/255 * 0x7f)
  57 #define pb_80 (~0UL/255 * 0x80)
  58
  59 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  60    specification, we interleave the fields */
  61 const uint8_t ff_zigzag248_direct[64] = {
  62      0,  8,  1,  9, 16, 24,  2, 10,
  63     17, 25, 32, 40, 48, 56, 33, 41,
  64     18, 26,  3, 11,  4, 12, 19, 27,
  65     34, 42, 49, 57, 50, 58, 35, 43,
  66     20, 28,  5, 13,  6, 14, 21, 29,
  67     36, 44, 51, 59, 52, 60, 37, 45,
  68     22, 30,  7, 15, 23, 31, 38, 46,
  69     53, 61, 54, 62, 39, 47, 55, 63,
  70 };
  71
  72 const uint8_t ff_alternate_horizontal_scan[64] = {
  73     0,  1,   2,  3,  8,  9, 16, 17,
  74     10, 11,  4,  5,  6,  7, 15, 14,
  75     13, 12, 19, 18, 24, 25, 32, 33,
  76     26, 27, 20, 21, 22, 23, 28, 29,
  77     30, 31, 34, 35, 40, 41, 48, 49,
  78     42, 43, 36, 37, 38, 39, 44, 45,
  79     46, 47, 50, 51, 56, 57, 58, 59,
  80     52, 53, 54, 55, 60, 61, 62, 63,
  81 };
  82
  83 const uint8_t ff_alternate_vertical_scan[64] = {
  84     0,  8,  16, 24,  1,  9,  2, 10,
  85     17, 25, 32, 40, 48, 56, 57, 49,
  86     41, 33, 26, 18,  3, 11,  4, 12,
  87     19, 27, 34, 42, 50, 58, 35, 43,
  88     51, 59, 20, 28,  5, 13,  6, 14,
  89     21, 29, 36, 44, 52, 60, 37, 45,
  90     53, 61, 22, 30,  7, 15, 23, 31,
  91     38, 46, 54, 62, 39, 47, 55, 63,
  92 };
  93
  94 /* Input permutation for the simple_idct_mmx */
  95 static const uint8_t simple_mmx_permutation[64]={
  96         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  97         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
  98         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
  99         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 100         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 101         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 102         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 103         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 104 };
 105
 106 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 107
 108 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
 109                                const uint8_t *src_scantable)
 110 {
 111     int i;
 112     int end;
 113
 114     st->scantable= src_scantable;
 115
 116     for(i=0; i<64; i++){
 117         int j;
 118         j = src_scantable[i];
 119         st->permutated[i] = permutation[j];
 120     }
 121
 122     end=-1;
 123     for(i=0; i<64; i++){
 124         int j;
 125         j = st->permutated[i];
 126         if(j>end) end=j;
 127         st->raster_end[i]= end;
 128     }
 129 }
 130
 131 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
 132                                            int idct_permutation_type)
 133 {
 134     int i;
 135
 136     switch(idct_permutation_type){
 137     case FF_NO_IDCT_PERM:
 138         for(i=0; i<64; i++)
 139             idct_permutation[i]= i;
 140         break;
 141     case FF_LIBMPEG2_IDCT_PERM:
 142         for(i=0; i<64; i++)
 143             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 144         break;
 145     case FF_SIMPLE_IDCT_PERM:
 146         for(i=0; i<64; i++)
 147             idct_permutation[i]= simple_mmx_permutation[i];
 148         break;
 149     case FF_TRANSPOSE_IDCT_PERM:
 150         for(i=0; i<64; i++)
 151             idct_permutation[i]= ((i&7)<<3) | (i>>3);
 152         break;
 153     case FF_PARTTRANS_IDCT_PERM:
 154         for(i=0; i<64; i++)
 155             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
 156         break;
 157     case FF_SSE2_IDCT_PERM:
 158         for(i=0; i<64; i++)
 159             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
 160         break;
 161     default:
 162         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
 163     }
 164 }
 165
 166 static int pix_sum_c(uint8_t * pix, int line_size)
 167 {
 168     int s, i, j;
 169
 170     s = 0;
 171     for (i = 0; i < 16; i++) {
 172         for (j = 0; j < 16; j += 8) {
 173             s += pix[0];
 174             s += pix[1];
 175             s += pix[2];
 176             s += pix[3];
 177             s += pix[4];
 178             s += pix[5];
 179             s += pix[6];
 180             s += pix[7];
 181             pix += 8;
 182         }
 183         pix += line_size - 16;
 184     }
 185     return s;
 186 }
 187
 188 static int pix_norm1_c(uint8_t * pix, int line_size)
 189 {
 190     int s, i, j;
 191     uint32_t *sq = ff_squareTbl + 256;
 192
 193     s = 0;
 194     for (i = 0; i < 16; i++) {
 195         for (j = 0; j < 16; j += 8) {
 196 #if 0
 197             s += sq[pix[0]];
 198             s += sq[pix[1]];
 199             s += sq[pix[2]];
 200             s += sq[pix[3]];
 201             s += sq[pix[4]];
 202             s += sq[pix[5]];
 203             s += sq[pix[6]];
 204             s += sq[pix[7]];
 205 #else
 206 #if HAVE_FAST_64BIT
 207             register uint64_t x=*(uint64_t*)pix;
 208             s += sq[x&0xff];
 209             s += sq[(x>>8)&0xff];
 210             s += sq[(x>>16)&0xff];
 211             s += sq[(x>>24)&0xff];
 212             s += sq[(x>>32)&0xff];
 213             s += sq[(x>>40)&0xff];
 214             s += sq[(x>>48)&0xff];
 215             s += sq[(x>>56)&0xff];
 216 #else
 217             register uint32_t x=*(uint32_t*)pix;
 218             s += sq[x&0xff];
 219             s += sq[(x>>8)&0xff];
 220             s += sq[(x>>16)&0xff];
 221             s += sq[(x>>24)&0xff];
 222             x=*(uint32_t*)(pix+4);
 223             s += sq[x&0xff];
 224             s += sq[(x>>8)&0xff];
 225             s += sq[(x>>16)&0xff];
 226             s += sq[(x>>24)&0xff];
 227 #endif
 228 #endif
 229             pix += 8;
 230         }
 231         pix += line_size - 16;
 232     }
 233     return s;
 234 }
 235
 236 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 237     int i;
 238
 239     for(i=0; i+8<=w; i+=8){
 240         dst[i+0]= av_bswap32(src[i+0]);
 241         dst[i+1]= av_bswap32(src[i+1]);
 242         dst[i+2]= av_bswap32(src[i+2]);
 243         dst[i+3]= av_bswap32(src[i+3]);
 244         dst[i+4]= av_bswap32(src[i+4]);
 245         dst[i+5]= av_bswap32(src[i+5]);
 246         dst[i+6]= av_bswap32(src[i+6]);
 247         dst[i+7]= av_bswap32(src[i+7]);
 248     }
 249     for(;i<w; i++){
 250         dst[i+0]= av_bswap32(src[i+0]);
 251     }
 252 }
 253
 254 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 255 {
 256     while (len--)
 257         *dst++ = av_bswap16(*src++);
 258 }
 259
 260 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 261 {
 262     int s, i;
 263     uint32_t *sq = ff_squareTbl + 256;
 264
 265     s = 0;
 266     for (i = 0; i < h; i++) {
 267         s += sq[pix1[0] - pix2[0]];
 268         s += sq[pix1[1] - pix2[1]];
 269         s += sq[pix1[2] - pix2[2]];
 270         s += sq[pix1[3] - pix2[3]];
 271         pix1 += line_size;
 272         pix2 += line_size;
 273     }
 274     return s;
 275 }
 276
 277 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 278 {
 279     int s, i;
 280     uint32_t *sq = ff_squareTbl + 256;
 281
 282     s = 0;
 283     for (i = 0; i < h; i++) {
 284         s += sq[pix1[0] - pix2[0]];
 285         s += sq[pix1[1] - pix2[1]];
 286         s += sq[pix1[2] - pix2[2]];
 287         s += sq[pix1[3] - pix2[3]];
 288         s += sq[pix1[4] - pix2[4]];
 289         s += sq[pix1[5] - pix2[5]];
 290         s += sq[pix1[6] - pix2[6]];
 291         s += sq[pix1[7] - pix2[7]];
 292         pix1 += line_size;
 293         pix2 += line_size;
 294     }
 295     return s;
 296 }
 297
 298 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 299 {
 300     int s, i;
 301     uint32_t *sq = ff_squareTbl + 256;
 302
 303     s = 0;
 304     for (i = 0; i < h; i++) {
 305         s += sq[pix1[ 0] - pix2[ 0]];
 306         s += sq[pix1[ 1] - pix2[ 1]];
 307         s += sq[pix1[ 2] - pix2[ 2]];
 308         s += sq[pix1[ 3] - pix2[ 3]];
 309         s += sq[pix1[ 4] - pix2[ 4]];
 310         s += sq[pix1[ 5] - pix2[ 5]];
 311         s += sq[pix1[ 6] - pix2[ 6]];
 312         s += sq[pix1[ 7] - pix2[ 7]];
 313         s += sq[pix1[ 8] - pix2[ 8]];
 314         s += sq[pix1[ 9] - pix2[ 9]];
 315         s += sq[pix1[10] - pix2[10]];
 316         s += sq[pix1[11] - pix2[11]];
 317         s += sq[pix1[12] - pix2[12]];
 318         s += sq[pix1[13] - pix2[13]];
 319         s += sq[pix1[14] - pix2[14]];
 320         s += sq[pix1[15] - pix2[15]];
 321
 322         pix1 += line_size;
 323         pix2 += line_size;
 324     }
 325     return s;
 326 }
 327
 328 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
 329                           const uint8_t *s2, int stride){
 330     int i;
 331
 332     /* read the pixels */
 333     for(i=0;i<8;i++) {
 334         block[0] = s1[0] - s2[0];
 335         block[1] = s1[1] - s2[1];
 336         block[2] = s1[2] - s2[2];
 337         block[3] = s1[3] - s2[3];
 338         block[4] = s1[4] - s2[4];
 339         block[5] = s1[5] - s2[5];
 340         block[6] = s1[6] - s2[6];
 341         block[7] = s1[7] - s2[7];
 342         s1 += stride;
 343         s2 += stride;
 344         block += 8;
 345     }
 346 }
 347
 348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
 349                                  int line_size)
 350 {
 351     int i;
 352
 353     /* read the pixels */
 354     for(i=0;i<8;i++) {
 355         pixels[0] = av_clip_uint8(block[0]);
 356         pixels[1] = av_clip_uint8(block[1]);
 357         pixels[2] = av_clip_uint8(block[2]);
 358         pixels[3] = av_clip_uint8(block[3]);
 359         pixels[4] = av_clip_uint8(block[4]);
 360         pixels[5] = av_clip_uint8(block[5]);
 361         pixels[6] = av_clip_uint8(block[6]);
 362         pixels[7] = av_clip_uint8(block[7]);
 363
 364         pixels += line_size;
 365         block += 8;
 366     }
 367 }
 368
 369 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
 370                                  int line_size)
 371 {
 372     int i;
 373
 374     /* read the pixels */
 375     for(i=0;i<4;i++) {
 376         pixels[0] = av_clip_uint8(block[0]);
 377         pixels[1] = av_clip_uint8(block[1]);
 378         pixels[2] = av_clip_uint8(block[2]);
 379         pixels[3] = av_clip_uint8(block[3]);
 380
 381         pixels += line_size;
 382         block += 8;
 383     }
 384 }
 385
 386 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
 387                                  int line_size)
 388 {
 389     int i;
 390
 391     /* read the pixels */
 392     for(i=0;i<2;i++) {
 393         pixels[0] = av_clip_uint8(block[0]);
 394         pixels[1] = av_clip_uint8(block[1]);
 395
 396         pixels += line_size;
 397         block += 8;
 398     }
 399 }
 400
 401 static void put_signed_pixels_clamped_c(const int16_t *block,
 402                                         uint8_t *av_restrict pixels,
 403                                         int line_size)
 404 {
 405     int i, j;
 406
 407     for (i = 0; i < 8; i++) {
 408         for (j = 0; j < 8; j++) {
 409             if (*block < -128)
 410                 *pixels = 0;
 411             else if (*block > 127)
 412                 *pixels = 255;
 413             else
 414                 *pixels = (uint8_t)(*block + 128);
 415             block++;
 416             pixels++;
 417         }
 418         pixels += (line_size - 8);
 419     }
 420 }
 421
 422 static void add_pixels8_c(uint8_t *av_restrict pixels,
 423                           int16_t *block,
 424                           int line_size)
 425 {
 426     int i;
 427
 428     for(i=0;i<8;i++) {
 429         pixels[0] += block[0];
 430         pixels[1] += block[1];
 431         pixels[2] += block[2];
 432         pixels[3] += block[3];
 433         pixels[4] += block[4];
 434         pixels[5] += block[5];
 435         pixels[6] += block[6];
 436         pixels[7] += block[7];
 437         pixels += line_size;
 438         block += 8;
 439     }
 440 }
 441
 442 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
 443                                  int line_size)
 444 {
 445     int i;
 446
 447     /* read the pixels */
 448     for(i=0;i<8;i++) {
 449         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 450         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 451         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 452         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 453         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
 454         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
 455         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
 456         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
 457         pixels += line_size;
 458         block += 8;
 459     }
 460 }
 461
 462 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
 463                           int line_size)
 464 {
 465     int i;
 466
 467     /* read the pixels */
 468     for(i=0;i<4;i++) {
 469         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 470         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 471         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
 472         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
 473         pixels += line_size;
 474         block += 8;
 475     }
 476 }
 477
 478 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
 479                           int line_size)
 480 {
 481     int i;
 482
 483     /* read the pixels */
 484     for(i=0;i<2;i++) {
 485         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
 486         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
 487         pixels += line_size;
 488         block += 8;
 489     }
 490 }
 491
 492 static int sum_abs_dctelem_c(int16_t *block)
 493 {
 494     int sum=0, i;
 495     for(i=0; i<64; i++)
 496         sum+= FFABS(block[i]);
 497     return sum;
 498 }
 499
 500 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 501 {
 502     int i;
 503
 504     for (i = 0; i < h; i++) {
 505         memset(block, value, 16);
 506         block += line_size;
 507     }
 508 }
 509
 510 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 511 {
 512     int i;
 513
 514     for (i = 0; i < h; i++) {
 515         memset(block, value, 8);
 516         block += line_size;
 517     }
 518 }
 519
 520 #define avg2(a,b) ((a+b+1)>>1)
 521 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 522
 523 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 524 {
 525     const int A=(16-x16)*(16-y16);
 526     const int B=(   x16)*(16-y16);
 527     const int C=(16-x16)*(   y16);
 528     const int D=(   x16)*(   y16);
 529     int i;
 530
 531     for(i=0; i<h; i++)
 532     {
 533         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 534         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 535         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 536         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 537         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 538         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 539         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 540         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 541         dst+= stride;
 542         src+= stride;
 543     }
 544 }
 545
 546 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 547                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 548 {
 549     int y, vx, vy;
 550     const int s= 1<<shift;
 551
 552     width--;
 553     height--;
 554
 555     for(y=0; y<h; y++){
 556         int x;
 557
 558         vx= ox;
 559         vy= oy;
 560         for(x=0; x<8; x++){ //XXX FIXME optimize
 561             int src_x, src_y, frac_x, frac_y, index;
 562
 563             src_x= vx>>16;
 564             src_y= vy>>16;
 565             frac_x= src_x&(s-1);
 566             frac_y= src_y&(s-1);
 567             src_x>>=shift;
 568             src_y>>=shift;
 569
 570             if((unsigned)src_x < width){
 571                 if((unsigned)src_y < height){
 572                     index= src_x + src_y*stride;
 573                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 574                                            + src[index       +1]*   frac_x )*(s-frac_y)
 575                                         + (  src[index+stride  ]*(s-frac_x)
 576                                            + src[index+stride+1]*   frac_x )*   frac_y
 577                                         + r)>>(shift*2);
 578                 }else{
 579                     index= src_x + av_clip(src_y, 0, height)*stride;
 580                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 581                                           + src[index       +1]*   frac_x )*s
 582                                         + r)>>(shift*2);
 583                 }
 584             }else{
 585                 if((unsigned)src_y < height){
 586                     index= av_clip(src_x, 0, width) + src_y*stride;
 587                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 588                                            + src[index+stride  ]*   frac_y )*s
 589                                         + r)>>(shift*2);
 590                 }else{
 591                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 592                     dst[y*stride + x]=    src[index         ];
 593                 }
 594             }
 595
 596             vx+= dxx;
 597             vy+= dyx;
 598         }
 599         ox += dxy;
 600         oy += dyy;
 601     }
 602 }
 603
 604 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 605     switch(width){
 606     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 607     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 608     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 609     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 610     }
 611 }
 612
 613 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 614     int i,j;
 615     for (i=0; i < height; i++) {
 616       for (j=0; j < width; j++) {
 617         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 618       }
 619       src += stride;
 620       dst += stride;
 621     }
 622 }
 623
 624 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 625     int i,j;
 626     for (i=0; i < height; i++) {
 627       for (j=0; j < width; j++) {
 628         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 629       }
 630       src += stride;
 631       dst += stride;
 632     }
 633 }
 634
 635 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 636     int i,j;
 637     for (i=0; i < height; i++) {
 638       for (j=0; j < width; j++) {
 639         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 640       }
 641       src += stride;
 642       dst += stride;
 643     }
 644 }
 645
 646 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 647     int i,j;
 648     for (i=0; i < height; i++) {
 649       for (j=0; j < width; j++) {
 650         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 651       }
 652       src += stride;
 653       dst += stride;
 654     }
 655 }
 656
 657 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 658     int i,j;
 659     for (i=0; i < height; i++) {
 660       for (j=0; j < width; j++) {
 661         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 662       }
 663       src += stride;
 664       dst += stride;
 665     }
 666 }
 667
 668 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 669     int i,j;
 670     for (i=0; i < height; i++) {
 671       for (j=0; j < width; j++) {
 672         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 673       }
 674       src += stride;
 675       dst += stride;
 676     }
 677 }
 678
 679 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 680     int i,j;
 681     for (i=0; i < height; i++) {
 682       for (j=0; j < width; j++) {
 683         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 684       }
 685       src += stride;
 686       dst += stride;
 687     }
 688 }
 689
 690 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 691     int i,j;
 692     for (i=0; i < height; i++) {
 693       for (j=0; j < width; j++) {
 694         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 695       }
 696       src += stride;
 697       dst += stride;
 698     }
 699 }
 700
 701 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 702     switch(width){
 703     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 704     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 705     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 706     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 707     }
 708 }
 709
 710 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 711     int i,j;
 712     for (i=0; i < height; i++) {
 713       for (j=0; j < width; j++) {
 714         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 715       }
 716       src += stride;
 717       dst += stride;
 718     }
 719 }
 720
 721 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 722     int i,j;
 723     for (i=0; i < height; i++) {
 724       for (j=0; j < width; j++) {
 725         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 726       }
 727       src += stride;
 728       dst += stride;
 729     }
 730 }
 731
 732 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 733     int i,j;
 734     for (i=0; i < height; i++) {
 735       for (j=0; j < width; j++) {
 736         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 737       }
 738       src += stride;
 739       dst += stride;
 740     }
 741 }
 742
 743 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 744     int i,j;
 745     for (i=0; i < height; i++) {
 746       for (j=0; j < width; j++) {
 747         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 748       }
 749       src += stride;
 750       dst += stride;
 751     }
 752 }
 753
 754 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 755     int i,j;
 756     for (i=0; i < height; i++) {
 757       for (j=0; j < width; j++) {
 758         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 759       }
 760       src += stride;
 761       dst += stride;
 762     }
 763 }
 764
 765 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 766     int i,j;
 767     for (i=0; i < height; i++) {
 768       for (j=0; j < width; j++) {
 769         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 770       }
 771       src += stride;
 772       dst += stride;
 773     }
 774 }
 775
 776 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 777     int i,j;
 778     for (i=0; i < height; i++) {
 779       for (j=0; j < width; j++) {
 780         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 781       }
 782       src += stride;
 783       dst += stride;
 784     }
 785 }
 786
 787 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 788     int i,j;
 789     for (i=0; i < height; i++) {
 790       for (j=0; j < width; j++) {
 791         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 792       }
 793       src += stride;
 794       dst += stride;
 795     }
 796 }
 797
 798 #define QPEL_MC(r, OPNAME, RND, OP) \
 799 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 800     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 801     int i;\
 802     for(i=0; i<h; i++)\
 803     {\
 804         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 805         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 806         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 807         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 808         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 809         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 810         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 811         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 812         dst+=dstStride;\
 813         src+=srcStride;\
 814     }\
 815 }\
 816 \
 817 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 818     const int w=8;\
 819     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 820     int i;\
 821     for(i=0; i<w; i++)\
 822     {\
 823         const int src0= src[0*srcStride];\
 824         const int src1= src[1*srcStride];\
 825         const int src2= src[2*srcStride];\
 826         const int src3= src[3*srcStride];\
 827         const int src4= src[4*srcStride];\
 828         const int src5= src[5*srcStride];\
 829         const int src6= src[6*srcStride];\
 830         const int src7= src[7*srcStride];\
 831         const int src8= src[8*srcStride];\
 832         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 833         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 834         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 835         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 836         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 837         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 838         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 839         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 840         dst++;\
 841         src++;\
 842     }\
 843 }\
 844 \
 845 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 846     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 847     int i;\
 848     \
 849     for(i=0; i<h; i++)\
 850     {\
 851         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 852         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 853         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 854         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 855         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 856         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 857         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 858         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 859         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 860         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 861         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 862         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 863         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 864         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 865         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 866         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 867         dst+=dstStride;\
 868         src+=srcStride;\
 869     }\
 870 }\
 871 \
 872 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 873     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 874     int i;\
 875     const int w=16;\
 876     for(i=0; i<w; i++)\
 877     {\
 878         const int src0= src[0*srcStride];\
 879         const int src1= src[1*srcStride];\
 880         const int src2= src[2*srcStride];\
 881         const int src3= src[3*srcStride];\
 882         const int src4= src[4*srcStride];\
 883         const int src5= src[5*srcStride];\
 884         const int src6= src[6*srcStride];\
 885         const int src7= src[7*srcStride];\
 886         const int src8= src[8*srcStride];\
 887         const int src9= src[9*srcStride];\
 888         const int src10= src[10*srcStride];\
 889         const int src11= src[11*srcStride];\
 890         const int src12= src[12*srcStride];\
 891         const int src13= src[13*srcStride];\
 892         const int src14= src[14*srcStride];\
 893         const int src15= src[15*srcStride];\
 894         const int src16= src[16*srcStride];\
 895         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 896         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 897         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 898         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 899         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 900         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 901         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 902         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 903         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 904         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 905         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 906         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 907         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 908         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 909         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 910         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 911         dst++;\
 912         src++;\
 913     }\
 914 }\
 915 \
 916 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 917 {\
 918     uint8_t half[64];\
 919     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 920     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 921 }\
 922 \
 923 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 924 {\
 925     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 926 }\
 927 \
 928 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 929 {\
 930     uint8_t half[64];\
 931     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 932     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 933 }\
 934 \
 935 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 936 {\
 937     uint8_t full[16*9];\
 938     uint8_t half[64];\
 939     copy_block9(full, src, 16, stride, 9);\
 940     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 941     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 942 }\
 943 \
 944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 945 {\
 946     uint8_t full[16*9];\
 947     copy_block9(full, src, 16, stride, 9);\
 948     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 949 }\
 950 \
 951 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 952 {\
 953     uint8_t full[16*9];\
 954     uint8_t half[64];\
 955     copy_block9(full, src, 16, stride, 9);\
 956     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 957     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 958 }\
 959 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 960 {\
 961     uint8_t full[16*9];\
 962     uint8_t halfH[72];\
 963     uint8_t halfV[64];\
 964     uint8_t halfHV[64];\
 965     copy_block9(full, src, 16, stride, 9);\
 966     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 967     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 968     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 969     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 970 }\
 971 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 972 {\
 973     uint8_t full[16*9];\
 974     uint8_t halfH[72];\
 975     uint8_t halfHV[64];\
 976     copy_block9(full, src, 16, stride, 9);\
 977     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 978     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 979     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 980     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 981 }\
 982 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 983 {\
 984     uint8_t full[16*9];\
 985     uint8_t halfH[72];\
 986     uint8_t halfV[64];\
 987     uint8_t halfHV[64];\
 988     copy_block9(full, src, 16, stride, 9);\
 989     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 990     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
 991     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 992     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 993 }\
 994 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 995 {\
 996     uint8_t full[16*9];\
 997     uint8_t halfH[72];\
 998     uint8_t halfHV[64];\
 999     copy_block9(full, src, 16, stride, 9);\
1000     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1004 }\
1005 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1006 {\
1007     uint8_t full[16*9];\
1008     uint8_t halfH[72];\
1009     uint8_t halfV[64];\
1010     uint8_t halfHV[64];\
1011     copy_block9(full, src, 16, stride, 9);\
1012     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1016 }\
1017 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1018 {\
1019     uint8_t full[16*9];\
1020     uint8_t halfH[72];\
1021     uint8_t halfHV[64];\
1022     copy_block9(full, src, 16, stride, 9);\
1023     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1024     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1025     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1026     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1027 }\
1028 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1029 {\
1030     uint8_t full[16*9];\
1031     uint8_t halfH[72];\
1032     uint8_t halfV[64];\
1033     uint8_t halfHV[64];\
1034     copy_block9(full, src, 16, stride, 9);\
1035     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1036     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1037     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1039 }\
1040 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1041 {\
1042     uint8_t full[16*9];\
1043     uint8_t halfH[72];\
1044     uint8_t halfHV[64];\
1045     copy_block9(full, src, 16, stride, 9);\
1046     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1050 }\
1051 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1052 {\
1053     uint8_t halfH[72];\
1054     uint8_t halfHV[64];\
1055     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1056     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1057     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1058 }\
1059 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1060 {\
1061     uint8_t halfH[72];\
1062     uint8_t halfHV[64];\
1063     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1064     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1065     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1066 }\
1067 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1068 {\
1069     uint8_t full[16*9];\
1070     uint8_t halfH[72];\
1071     uint8_t halfV[64];\
1072     uint8_t halfHV[64];\
1073     copy_block9(full, src, 16, stride, 9);\
1074     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1076     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1078 }\
1079 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1080 {\
1081     uint8_t full[16*9];\
1082     uint8_t halfH[72];\
1083     copy_block9(full, src, 16, stride, 9);\
1084     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1085     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1086     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1087 }\
1088 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1089 {\
1090     uint8_t full[16*9];\
1091     uint8_t halfH[72];\
1092     uint8_t halfV[64];\
1093     uint8_t halfHV[64];\
1094     copy_block9(full, src, 16, stride, 9);\
1095     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1096     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1097     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1098     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1099 }\
1100 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1101 {\
1102     uint8_t full[16*9];\
1103     uint8_t halfH[72];\
1104     copy_block9(full, src, 16, stride, 9);\
1105     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1106     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1107     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1108 }\
1109 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1110 {\
1111     uint8_t halfH[72];\
1112     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1113     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1114 }\
1115 \
1116 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1117 {\
1118     uint8_t half[256];\
1119     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1120     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1121 }\
1122 \
1123 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1124 {\
1125     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1126 }\
1127 \
1128 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1129 {\
1130     uint8_t half[256];\
1131     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1133 }\
1134 \
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1136 {\
1137     uint8_t full[24*17];\
1138     uint8_t half[256];\
1139     copy_block17(full, src, 24, stride, 17);\
1140     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1142 }\
1143 \
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1145 {\
1146     uint8_t full[24*17];\
1147     copy_block17(full, src, 24, stride, 17);\
1148     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1149 }\
1150 \
1151 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1152 {\
1153     uint8_t full[24*17];\
1154     uint8_t half[256];\
1155     copy_block17(full, src, 24, stride, 17);\
1156     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1157     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1158 }\
1159 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1160 {\
1161     uint8_t full[24*17];\
1162     uint8_t halfH[272];\
1163     uint8_t halfV[256];\
1164     uint8_t halfHV[256];\
1165     copy_block17(full, src, 24, stride, 17);\
1166     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1170 }\
1171 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1172 {\
1173     uint8_t full[24*17];\
1174     uint8_t halfH[272];\
1175     uint8_t halfHV[256];\
1176     copy_block17(full, src, 24, stride, 17);\
1177     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1179     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1181 }\
1182 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1183 {\
1184     uint8_t full[24*17];\
1185     uint8_t halfH[272];\
1186     uint8_t halfV[256];\
1187     uint8_t halfHV[256];\
1188     copy_block17(full, src, 24, stride, 17);\
1189     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1190     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1191     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1193 }\
1194 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1195 {\
1196     uint8_t full[24*17];\
1197     uint8_t halfH[272];\
1198     uint8_t halfHV[256];\
1199     copy_block17(full, src, 24, stride, 17);\
1200     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1202     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1204 }\
1205 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1206 {\
1207     uint8_t full[24*17];\
1208     uint8_t halfH[272];\
1209     uint8_t halfV[256];\
1210     uint8_t halfHV[256];\
1211     copy_block17(full, src, 24, stride, 17);\
1212     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1216 }\
1217 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1218 {\
1219     uint8_t full[24*17];\
1220     uint8_t halfH[272];\
1221     uint8_t halfHV[256];\
1222     copy_block17(full, src, 24, stride, 17);\
1223     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1225     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1227 }\
1228 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1229 {\
1230     uint8_t full[24*17];\
1231     uint8_t halfH[272];\
1232     uint8_t halfV[256];\
1233     uint8_t halfHV[256];\
1234     copy_block17(full, src, 24, stride, 17);\
1235     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1236     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1237     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1239 }\
1240 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1241 {\
1242     uint8_t full[24*17];\
1243     uint8_t halfH[272];\
1244     uint8_t halfHV[256];\
1245     copy_block17(full, src, 24, stride, 17);\
1246     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1249     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1250 }\
1251 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1252 {\
1253     uint8_t halfH[272];\
1254     uint8_t halfHV[256];\
1255     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1256     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1257     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1258 }\
1259 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1260 {\
1261     uint8_t halfH[272];\
1262     uint8_t halfHV[256];\
1263     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1266 }\
1267 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1268 {\
1269     uint8_t full[24*17];\
1270     uint8_t halfH[272];\
1271     uint8_t halfV[256];\
1272     uint8_t halfHV[256];\
1273     copy_block17(full, src, 24, stride, 17);\
1274     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1275     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1276     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1277     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1278 }\
1279 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1280 {\
1281     uint8_t full[24*17];\
1282     uint8_t halfH[272];\
1283     copy_block17(full, src, 24, stride, 17);\
1284     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1285     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1286     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1287 }\
1288 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1289 {\
1290     uint8_t full[24*17];\
1291     uint8_t halfH[272];\
1292     uint8_t halfV[256];\
1293     uint8_t halfHV[256];\
1294     copy_block17(full, src, 24, stride, 17);\
1295     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1296     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1297     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1298     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1299 }\
1300 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1301 {\
1302     uint8_t full[24*17];\
1303     uint8_t halfH[272];\
1304     copy_block17(full, src, 24, stride, 17);\
1305     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1306     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1307     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1308 }\
1309 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1310 {\
1311     uint8_t halfH[272];\
1312     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1313     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1314 }
1315
/*
 * Store operators handed to QPEL_MC as its OP argument.
 *
 * 'b' is the raw filter sum; the qpel lowpass taps (20, -6, 3, -1 applied to
 * symmetric pairs) add up to 32, hence the >>5 normalisation.  The result is
 * looked up through 'cm', which every generated function defines as
 * ff_cropTbl + MAX_NEG_CROP (presumably a clamp-to-uint8 table -- confirm
 * against its definition).
 *
 *   op_put         stores the value rounded with +16.
 *   op_put_no_rnd  stores the value with a +15 bias instead of +16.
 *   op_avg         averages the +16 value with the existing destination
 *                  pixel, rounding that average up (+1).
 *   op_avg_no_rnd  averages the +15 value with the destination pixel
 *                  without the +1.
 *
 * Note that these expand to bare assignments using 'a' unparenthesised, so
 * they are only meant to be used as full statements inside QPEL_MC.
 */
1316 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1317 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1318 #define op_put(a, b) a = cm[((b) + 16)>>5]
1319 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1320
/*
 * Instantiate the qpel function families.  The second argument is pasted in
 * front of every generated name (put_qpel8_mc10_c, avg_qpel16_mc22_c, ...);
 * the third selects which "put" lowpass helpers are used for intermediate
 * buffers (put_mpeg4_* for "_", put_no_rnd_mpeg4_* for "_no_rnd_").
 * The avg_ family therefore builds its intermediates with the rounding put
 * helpers.  The avg_no_rnd instantiation below is disabled.
 */
1321 QPEL_MC(0, put_       , _       , op_put)
1322 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1323 QPEL_MC(0, avg_       , _       , op_avg)
1324 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only needed while expanding QPEL_MC. */
1325 #undef op_avg
1326 #undef op_avg_no_rnd
1327 #undef op_put
1328 #undef op_put_no_rnd
1329
/**
 * Fixed-size 8x8 "put" entry point.
 *
 * Forwards to put_pixels8_8_c() with the height pinned to 8 rows, so the
 * operation is reachable through a (dst, src, stride) signature.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_8_c(dst, src, stride, rows);
}
/**
 * Fixed-size 8x8 "avg" entry point.
 *
 * Forwards to avg_pixels8_8_c() with the height pinned to 8 rows.
 *
 * @param dst    destination block (also an input of the avg operation)
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_8_c(dst, src, stride, rows);
}
/**
 * Fixed-size 16x16 "put" entry point.
 *
 * Forwards to put_pixels16_8_c() with the height pinned to 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_8_c(dst, src, stride, rows);
}
/**
 * Fixed-size 16x16 "avg" entry point.
 *
 * Forwards to avg_pixels16_8_c() with the height pinned to 16 rows.
 *
 * @param dst    destination block (also an input of the avg operation)
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_8_c(dst, src, stride, rows);
}
1346
/*
 * QPEL_MC does not generate an mc00 member for any family; these aliases
 * supply it by mapping each mc00 name onto the fixed-size block wrappers
 * (mc00 is presumably the full-pel position, i.e. no interpolation --
 * confirm against the users of the qpel tables).
 * Both put_no_rnd aliases point at the same functions as the rounding put
 * aliases.
 */
1347 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
1348 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1349 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1350 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1351 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1352 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1353
1354 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1355     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1356     int i;
1357
1358     for(i=0; i<h; i++){
1359         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1360         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1361         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1362         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1363         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1364         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1365         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1366         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1367         dst+=dstStride;
1368         src+=srcStride;
1369     }
1370 }
1371
1372 #if CONFIG_RV40_DECODER
/**
 * RV40 16x16 mc33 "put" case: delegates to put_pixels16_xy2_8_c() with a
 * fixed height of 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_xy2_8_c(dst, src, stride, rows);
}
/**
 * RV40 16x16 mc33 "avg" case: delegates to avg_pixels16_xy2_8_c() with a
 * fixed height of 16 rows.
 *
 * @param dst    destination block (also an input of the avg operation)
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}
/**
 * RV40 8x8 mc33 "put" case: delegates to put_pixels8_xy2_8_c() with a fixed
 * height of 8 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_xy2_8_c(dst, src, stride, rows);
}
/**
 * RV40 8x8 mc33 "avg" case: delegates to avg_pixels8_xy2_8_c() with a fixed
 * height of 8 rows.
 *
 * @param dst    destination block (also an input of the avg operation)
 * @param src    source block
 * @param stride line step, passed unchanged to the underlying routine
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
1389 #endif /* CONFIG_RV40_DECODER */
1390
1391 #if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME) generates the nine ff_<OPNAME>_dirac_pixels{8,16,32}[_l2|_l4]_c
 * functions.  All of them share the signature
 *     (uint8_t *dst, const uint8_t *src[5], int stride, int h)
 * and differ only in how many of the source pointers they consume:
 *   - plain variants use src[0] only;
 *   - _l2 variants combine src[0] and src[1];
 *   - _l4 variants combine src[0] .. src[3].
 * src[4] is not read by any of the functions generated here.
 * The same 'stride' is passed for the destination and for every source.
 * The 32-pixel-wide variants are built from two 16-pixel-wide calls, at
 * column offsets 0 and 16.
 * The macro is instantiated below for OPNAME = put and OPNAME = avg.
 */
1392 #define DIRAC_MC(OPNAME)\
1393 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1394 {\
1395      OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1396 }\
1397 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1398 {\
1399     OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1400 }\
1401 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1402 {\
1403     OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
1404     OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1405 }\
1406 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1407 {\
1408     OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1409 }\
1410 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1411 {\
1412     OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1413 }\
1414 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1415 {\
1416     OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
1417     OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1418 }\
1419 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1420 {\
1421     OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1422 }\
1423 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1424 {\
1425     OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1426 }\
1427 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1428 {\
1429     OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
1430     OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
1431 }
1432 DIRAC_MC(put)
1433 DIRAC_MC(avg)
1434 #endif
1435
1436 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1437     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1438     int i;
1439
1440     for(i=0; i<w; i++){
1441         const int src_1= src[ -srcStride];
1442         const int src0 = src[0          ];
1443         const int src1 = src[  srcStride];
1444         const int src2 = src[2*srcStride];
1445         const int src3 = src[3*srcStride];
1446         const int src4 = src[4*srcStride];
1447         const int src5 = src[5*srcStride];
1448         const int src6 = src[6*srcStride];
1449         const int src7 = src[7*srcStride];
1450         const int src8 = src[8*srcStride];
1451         const int src9 = src[9*srcStride];
1452         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1453         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1454         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1455         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1456         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1457         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1458         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1459         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1460         src++;
1461         dst++;
1462     }
1463 }
1464
/**
 * mspel 8x8 case mc10: the horizontally filtered block is combined with the
 * unshifted source block by put_pixels8_l2_8() (presumably an average of the
 * two inputs -- confirm against the template).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];    /* 8 rows of 8 pixels, packed with stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1471
/**
 * mspel 8x8 case mc20: the horizontal filter output is written straight to
 * dst, using the same stride for source and destination.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1476
/**
 * mspel 8x8 case mc30: like mc10, but the horizontally filtered block is
 * combined with the source block shifted one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];    /* 8 rows of 8 pixels, packed with stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1483
/**
 * mspel 8x8 case mc02: the vertical filter output for 8 columns is written
 * straight to dst, using the same stride for source and destination.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
1488
/**
 * mspel 8x8 case mc12: combines the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2_8().
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter applied afterwards reads rows -1 .. 9 of its input;
 * that is why it is fed rowsH + 8 (the second packed row).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rowsH[11 * 8];      /* horizontal pass, packed with stride 8 */
    uint8_t colV[8 * 8];        /* vertical pass on the raw source */
    uint8_t bothHV[8 * 8];      /* vertical pass on rowsH */

    wmv2_mspel8_v_lowpass(colV, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(bothHV, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, colV, bothHV, stride, 8, 8, 8);
}
/**
 * mspel 8x8 case mc32: same as mc12 except that the purely vertical pass is
 * taken from the source shifted one pixel to the right (src + 1).
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter applied afterwards reads rows -1 .. 9 of its input;
 * that is why it is fed rowsH + 8 (the second packed row).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rowsH[11 * 8];      /* horizontal pass, packed with stride 8 */
    uint8_t colV[8 * 8];        /* vertical pass on the shifted raw source */
    uint8_t bothHV[8 * 8];      /* vertical pass on rowsH */

    wmv2_mspel8_v_lowpass(colV, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(bothHV, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, colV, bothHV, stride, 8, 8, 8);
}
/**
 * mspel 8x8 case mc22: horizontal filter followed by vertical filter, with
 * the final result written directly to dst.
 *
 * The horizontal pass covers 11 rows starting one row above src so that the
 * vertical pass, started at the second packed row, finds the rows -1 .. 9
 * it reads.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rowsH[11 * 8];      /* horizontal pass, packed with stride 8 */

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowsH + 8, stride, 8, 8);
}
1515
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size step between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated |pix1 - pix2| over 16 columns and h rows
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1543
/**
 * Sum of absolute differences between a 16-pixel-wide block and the avg2()
 * of each pair of horizontally adjacent pixels of a second block.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; pix2[0] .. pix2[16] are read on every row
 * @param line_size step between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute differences over 16 columns and h rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1571
/**
 * SAD of a 16-wide block against pix2 with avg2() applied to each pair of
 * vertically adjacent pix2 samples.
 *
 * For h lines of output, h + 1 lines of pix2 are read.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
1601
/**
 * SAD of a 16-wide block against pix2 with avg4() applied to each 2x2
 * neighbourhood of pix2 samples.
 *
 * Reads 17 samples per line and h + 1 lines of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                          below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
1631
/**
 * Sum of absolute differences over a block 8 pixels wide and h lines tall.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive lines of both blocks
 * @param h         number of lines
 * @return the accumulated |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
1651
/**
 * SAD of an 8-wide block against pix2 with avg2() applied to each pair of
 * horizontally adjacent pix2 samples.
 *
 * Each line reads pix2[0..8], i.e. one sample beyond the block width.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
1671
/**
 * SAD of an 8-wide block against pix2 with avg2() applied to each pair of
 * vertically adjacent pix2 samples.
 *
 * For h lines of output, h + 1 lines of pix2 are read.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
1693
/**
 * SAD of an 8-wide block against pix2 with avg4() applied to each 2x2
 * neighbourhood of pix2 samples.
 *
 * Reads 9 samples per line and h + 1 lines of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between consecutive lines
 * @param h         number of lines
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                          below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
1715
1716 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1717     MpegEncContext *c = v;
1718     int score1=0;
1719     int score2=0;
1720     int x,y;
1721
1722     for(y=0; y<h; y++){
1723         for(x=0; x<16; x++){
1724             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1725         }
1726         if(y+1<h){
1727             for(x=0; x<15; x++){
1728                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1729                              - s1[x+1] + s1[x+1+stride])
1730                         -FFABS(  s2[x  ] - s2[x  +stride]
1731                              - s2[x+1] + s2[x+1+stride]);
1732             }
1733         }
1734         s1+= stride;
1735         s2+= stride;
1736     }
1737
1738     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1739     else  return score1 + FFABS(score2)*8;
1740 }
1741
1742 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1743     MpegEncContext *c = v;
1744     int score1=0;
1745     int score2=0;
1746     int x,y;
1747
1748     for(y=0; y<h; y++){
1749         for(x=0; x<8; x++){
1750             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1751         }
1752         if(y+1<h){
1753             for(x=0; x<7; x++){
1754                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1755                              - s1[x+1] + s1[x+1+stride])
1756                         -FFABS(  s2[x  ] - s2[x  +stride]
1757                              - s2[x+1] + s2[x+1+stride]);
1758             }
1759         }
1760         s1+= stride;
1761         s2+= stride;
1762     }
1763
1764     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1765     else  return score1 + FFABS(score2)*8;
1766 }
1767
1768 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1769     int i;
1770     unsigned int sum=0;
1771
1772     for(i=0; i<8*8; i++){
1773         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1774         int w= weight[i];
1775         b>>= RECON_SHIFT;
1776         av_assert2(-512<b && b<512);
1777
1778         sum += (w*b)*(w*b)>>4;
1779     }
1780     return sum>>2;
1781 }
1782
1783 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1784     int i;
1785
1786     for(i=0; i<8*8; i++){
1787         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1788     }
1789 }
1790
/**
 * Comparison function that ignores all of its arguments and reports a
 * score of zero.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
1794
1795 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1796     int i;
1797
1798     memset(cmp, 0, sizeof(void*)*6);
1799
1800     for(i=0; i<6; i++){
1801         switch(type&0xFF){
1802         case FF_CMP_SAD:
1803             cmp[i]= c->sad[i];
1804             break;
1805         case FF_CMP_SATD:
1806             cmp[i]= c->hadamard8_diff[i];
1807             break;
1808         case FF_CMP_SSE:
1809             cmp[i]= c->sse[i];
1810             break;
1811         case FF_CMP_DCT:
1812             cmp[i]= c->dct_sad[i];
1813             break;
1814         case FF_CMP_DCT264:
1815             cmp[i]= c->dct264_sad[i];
1816             break;
1817         case FF_CMP_DCTMAX:
1818             cmp[i]= c->dct_max[i];
1819             break;
1820         case FF_CMP_PSNR:
1821             cmp[i]= c->quant_psnr[i];
1822             break;
1823         case FF_CMP_BIT:
1824             cmp[i]= c->bit[i];
1825             break;
1826         case FF_CMP_RD:
1827             cmp[i]= c->rd[i];
1828             break;
1829         case FF_CMP_VSAD:
1830             cmp[i]= c->vsad[i];
1831             break;
1832         case FF_CMP_VSSE:
1833             cmp[i]= c->vsse[i];
1834             break;
1835         case FF_CMP_ZERO:
1836             cmp[i]= zero_cmp;
1837             break;
1838         case FF_CMP_NSSE:
1839             cmp[i]= c->nsse[i];
1840             break;
1841 #if CONFIG_DWT
1842         case FF_CMP_W53:
1843             cmp[i]= c->w53[i];
1844             break;
1845         case FF_CMP_W97:
1846             cmp[i]= c->w97[i];
1847             break;
1848 #endif
1849         default:
1850             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1851         }
1852     }
1853 }
1854
1855 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1856     long i;
1857     for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
1858         long a = *(long*)(src+i);
1859         long b = *(long*)(dst+i);
1860         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1861     }
1862     for(; i<w; i++)
1863         dst[i+0] += src[i+0];
1864 }
1865
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w), with each byte wrapping
 * independently.
 *
 * @param dst  output, w bytes
 * @param src1 minuend, w bytes
 * @param src2 subtrahend, w bytes
 * @param w    number of bytes
 */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    /* src2 is not aligned to sizeof(long): avoid the word-sized loads below
     * and subtract byte by byte, eight bytes per iteration */
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* one long at a time: the top bit of every byte of a is forced on and
     * that of b masked off before subtracting, and the real top bits are
     * then recombined by XOR with pb_80 */
    for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    /* remaining bytes that neither loop above covered */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1890
/**
 * Reconstruct w bytes from residuals using a median predictor.
 *
 * For every position the predictor is mid_pred() of the previous output
 * byte, src1[i], and (previous output + src1[i] - previous src1 value) & 0xFF;
 * diff[i] is added to it and the result, truncated to 8 bits, is stored.
 *
 * @param dst      output, w bytes
 * @param src1     reference samples, w bytes (presumably the line above)
 * @param diff     residuals, w bytes
 * @param w        number of bytes
 * @param left     in: value used as "previous output" for i == 0;
 *                 out: last output byte
 * @param left_top in: value used as "previous src1" for i == 0;
 *                 out: last src1 byte consumed
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur  = *left;
    uint8_t prev = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        cur    = mid_pred(cur, src1[x], (cur + src1[x] - prev)&0xFF) + diff[x];
        prev   = src1[x];
        dst[x] = cur;
    }

    *left     = cur;
    *left_top = prev;
}
1907
/**
 * Produce w residual bytes by subtracting a median predictor from src2.
 *
 * For every position the predictor is mid_pred() of the previous src2 byte,
 * src1[i], and (previous src2 + src1[i] - previous src1 value) & 0xFF.
 *
 * @param dst      output residuals, w bytes
 * @param src1     reference samples, w bytes (presumably the line above)
 * @param src2     samples being predicted, w bytes
 * @param w        number of bytes
 * @param left     in: value used as "previous src2" for i == 0;
 *                 out: last src2 byte consumed
 * @param left_top in: value used as "previous src1" for i == 0;
 *                 out: last src1 byte consumed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur  = *left;
    uint8_t prev = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int predicted = mid_pred(cur, src1[x], (cur + src1[x] - prev)&0xFF);

        prev   = src1[x];
        cur    = src2[x];
        dst[x] = cur - predicted;
    }

    *left     = cur;
    *left_top = prev;
}
1925
/**
 * Running sum of src[] written to dst[].
 *
 * dst[i] receives the low 8 bits of acc + src[0] + ... + src[i].
 *
 * @param dst output, w bytes
 * @param src input, w bytes
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the final accumulator, not reduced to 8 bits
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1944
/* Byte offset of each channel inside a 4-byte pixel; the order is reversed
 * when HAVE_BIGENDIAN is set. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over w 4-byte pixels.
 *
 * Every channel keeps its own accumulator; for each pixel the source byte
 * is added to it and the low 8 bits are written to dst.
 *
 * @param dst   output, 4*w bytes
 * @param src   input, 4*w bytes
 * @param w     number of pixels
 * @param red   in: initial accumulator, out: final accumulator (same for
 *              green, blue and alpha); the values are not reduced to 8 bits
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1985
/* Out-of-place butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two bare statements and evaluates each input twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y and y becomes x - y, using the
 * original values of both. Declares locals named a and b in its own block. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x + y| + |x - y|: the summed magnitudes of a butterfly's two outputs,
 * without storing them. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2000
/**
 * Sum of absolute values of the 8x8 butterfly transform of (src - dst).
 *
 * Each line of the difference goes through three butterfly stages, then
 * each column does the same; the last column stage is folded into the
 * accumulation through BUTTERFLYA.
 *
 * @param s      unused
 * @param stride distance in bytes between consecutive lines of both blocks
 * @param h      expected to be 8 (checked with av_assert2)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int total = 0;
    int k;

    av_assert2(h==8);

    /* line pass */
    for (k = 0; k < 8; k++) {
        int *const row = temp + 8*k;
        const uint8_t *const sp = src + stride*k;
        const uint8_t *const dp = dst + stride*k;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], sp[0]-dp[0],sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2],sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4],sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6],sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for (k = 0; k < 8; k++) {
        int *const col = temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
2045
/**
 * Sum of absolute values of the 8x8 butterfly transform of src itself,
 * with the magnitude of the (0,0) output removed at the end.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive lines
 * @param h      expected to be 8 (checked with av_assert2)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int total = 0;
    int k;

    av_assert2(h==8);

    /* line pass */
    for (k = 0; k < 8; k++) {
        int *const row = temp + 8*k;
        const uint8_t *const sp = src + stride*k;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], sp[0],sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2],sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4],sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6],sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for (k = 0; k < 8; k++) {
        int *const col = temp + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* temp[0] + temp[32] is the sum output of the skipped final butterfly
     * in column 0 */
    total -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return total;
}
2093
2094 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2095     MpegEncContext * const s= (MpegEncContext *)c;
2096     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2097
2098     av_assert2(h==8);
2099
2100     s->dsp.diff_pixels(temp, src1, src2, stride);
2101     s->dsp.fdct(temp);
2102     return s->dsp.sum_abs_dctelem(temp);
2103 }
2104
#if CONFIG_GPL
/* One 8-point 1-D integer transform pass built from additions, subtractions
 * and right shifts only.
 * The eight inputs are read through SRC(0..7) and the eight outputs are
 * emitted through DST(index, value); both macros must be defined by the
 * code that expands DCT8_1D. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute transform coefficients of the difference of two 8x8
 * blocks, using the DCT8_1D transform on rows and then on columns.
 *
 * @param c      MpegEncContext pointer supplying dsp.diff_pixels
 * @param stride distance in bytes between consecutive lines of both blocks
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform each row of dct[][] in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform each column; the outputs are not stored, only
     * their absolute values are accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2157
2158 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2159     MpegEncContext * const s= (MpegEncContext *)c;
2160     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2161     int sum=0, i;
2162
2163     av_assert2(h==8);
2164
2165     s->dsp.diff_pixels(temp, src1, src2, stride);
2166     s->dsp.fdct(temp);
2167
2168     for(i=0; i<64; i++)
2169         sum= FFMAX(sum, FFABS(temp[i]));
2170
2171     return sum;
2172 }
2173
2174 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2175     MpegEncContext * const s= (MpegEncContext *)c;
2176     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2177     int16_t * const bak = temp+64;
2178     int sum=0, i;
2179
2180     av_assert2(h==8);
2181     s->mb_intra=0;
2182
2183     s->dsp.diff_pixels(temp, src1, src2, stride);
2184
2185     memcpy(bak, temp, 64*sizeof(int16_t));
2186
2187     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2188     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2189     ff_simple_idct_8(temp); //FIXME
2190
2191     for(i=0; i<64; i++)
2192         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2193
2194     return sum;
2195 }
2196
/**
 * Combined bit-count and distortion score for coding the difference of two
 * 8x8 blocks.
 *
 * The difference is quantized, its VLC bit cost is added up from the
 * context's length tables, and the block is then dequantized and added back
 * onto a local copy of src2 so that the squared error against src1 can be
 * measured. The result is distortion + ((bits*qscale*qscale*109 + 64)>>7).
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext pointer
 * @param stride distance in bytes between consecutive lines of src1 and src2
 * @param h      expected to be 8 (checked with av_assert2)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    /* work on local copies with a line size of 8; lsrc2 is modified below */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks start at scan position 1 and account for the first
     * coefficient separately via luma_dc_vlc_length */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every nonzero coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                /* level+64 within 0..127: look the cost up in the table,
                 * otherwise charge the escape length */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the coefficient at scan position last uses the "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* add the inverse-transformed block onto the local copy of src2 */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2272
/**
 * Bit-count score for coding the difference of two 8x8 blocks.
 *
 * The difference is quantized with the context's fast_dct_quantize() and
 * the VLC lengths of the resulting coefficients are added up from the
 * context's length tables.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext pointer
 * @param stride distance in bytes between consecutive lines of both blocks
 * @param h      expected to be 8 (checked with av_assert2)
 * @return the accumulated bit count
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks start at scan position 1 and account for the first
     * coefficient separately via luma_dc_vlc_length */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every nonzero coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                /* level+64 within 0..127: look the cost up in the table,
                 * otherwise charge the escape length */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the coefficient at scan position last uses the "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2331
/* Defines vsad_intra<size>_c(): the sum of absolute differences between
 * every line of a size-wide block and the line directly below it, over
 * lines 0..h-2. The inner loop handles four columns per iteration, so size
 * must be a multiple of 4. The c and dummy arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2349
/**
 * Vertical SAD of the difference signal of two 16-wide blocks.
 *
 * For every pair of vertically adjacent lines, accumulates the absolute
 * change of (s1 - s2) from one line to the next.
 *
 * @param c      unused
 * @param stride distance in bytes between consecutive lines of both blocks
 * @param h      number of lines; h - 1 line pairs are evaluated
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, line;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            total += FFABS(here - below);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2364
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Defines vsse_intra<size>_c(): the sum of squared differences between
 * every line of a size-wide block and the line directly below it, over
 * lines 0..h-2. The inner loop handles four columns per iteration, so size
 * must be a multiple of 4. The c and dummy arguments are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2383
/**
 * Vertical sum of squares of the difference signal of two 16-wide blocks.
 *
 * For every pair of vertically adjacent lines, accumulates the squared
 * change of (s1 - s2) from one line to the next.
 *
 * @param c      unused
 * @param stride distance in bytes between consecutive lines of both blocks
 * @param h      number of lines; h - 1 line pairs are evaluated
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, line;

    for (line = 1; line < h; line++) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            total += SQ(here - below);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2398
/**
 * Sum of squared differences between an int8_t vector and an int16_t
 * vector.
 *
 * @param pix1 first vector, size entries
 * @param pix2 second vector, size entries
 * @param size number of entries
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
2407
/* Defines name16(), a 16-pixel-wide comparison function built from the
 * 8x8 function name8: the scores of the left and right 8x8 blocks of the
 * top eight lines are added, and when h == 16 the two blocks of the
 * following eight lines are added as well. name8 is always called with a
 * height of 8. */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2432
/**
 * Clip one value using unsigned comparisons on raw 32-bit patterns.
 *
 * Returns mini when a compares above mini, otherwise maxi when a with its
 * top bit flipped compares above maxisign, otherwise a unchanged.
 * NOTE(review): intended for float bit patterns with a negative minimum
 * and a positive maximum, as set up by vector_clipf_c_opposite_sign().
 *
 * @param a        value to clip
 * @param mini     bit pattern of the lower bound
 * @param maxi     bit pattern of the upper bound
 * @param maxisign maxi with bit 31 flipped
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2441
/**
 * Clip a float vector by comparing raw 32-bit patterns with clipf_c_one().
 *
 * Elements are processed in groups of eight, so indices up to the next
 * multiple of 8 at or above len are read and written.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound; its bit pattern is read
 * @param max pointer to the upper bound; its bit pattern is read
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits    = *(uint32_t*)min;
    const uint32_t hi_bits    = *(uint32_t*)max;
    const uint32_t hi_flipped = hi_bits ^ (1U<<31);
    uint32_t *out = (uint32_t*)dst;
    const uint32_t *in = (const uint32_t*)src;
    int base, lane;

    for (base = 0; base < len; base += 8) {
        for (lane = 0; lane < 8; lane++)
            out[base + lane] = clipf_c_one(in[base + lane], lo_bits, hi_bits, hi_flipped);
    }
}
/**
 * Clip every element of a float vector to the range [min, max].
 *
 * When the bounds have opposite signs (min < 0 < max) the work is handed
 * to vector_clipf_c_opposite_sign(); otherwise each element goes through
 * av_clipf().
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; processed in groups of 8, so a length
 *            that is not a multiple of 8 is rounded up to the next one
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2477
/**
 * Dot product of two int16_t vectors.
 *
 * Each product of two int16_t values fits in an int, but their sum may
 * not. The sum is therefore accumulated in unsigned arithmetic, which
 * wraps modulo 2^N instead of triggering signed-overflow undefined
 * behaviour, and is converted to int32_t on return.
 *
 * @param v1    first vector, order elements
 * @param v2    second vector, order elements
 * @param order number of elements; values <= 0 yield 0 without reading
 * @return sum of v1[i] * v2[i] (wrapping on overflow)
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    unsigned res = 0;
    int i;

    for (i = 0; i < order; i++)
        res += v1[i] * v2[i];

    return (int32_t)res;
}
2487
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For every element, the product v1[i] * v2[i] is added to the result
 * using the value of v1[i] *before* it is updated; then v1[i] is
 * incremented by mul * v3[i] (stored back as int16_t).
 *
 * The dot product is accumulated in unsigned arithmetic so that a sum
 * exceeding the int range wraps instead of being undefined behaviour.
 *
 * @param v1    vector that is both read and updated, order elements
 * @param v2    second factor of the dot product, order elements
 * @param v3    vector scaled by mul and added to v1, order elements
 * @param order number of elements; values <= 0 leave v1 untouched and
 *              yield 0
 * @param mul   scale factor applied to v3
 * @return sum of (old) v1[i] * v2[i] (wrapping on overflow)
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    unsigned res = 0;
    int i;

    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }

    return (int32_t)res;
}
2497
/**
 * Clip every element of an int32_t vector to the range [min, max].
 *
 * A plain counted loop is used on purpose: an 8-way unrolled do/while
 * that subtracts 8 from an unsigned counter wraps around when len is 0
 * or not a multiple of 8 and then runs far past the end of both buffers.
 * For lengths that are a non-zero multiple of 8 the result is identical.
 *
 * @param dst destination vector (may be the same buffer as src; elements
 *            are processed one at a time, in increasing order)
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; 0 is allowed and does nothing
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
2513
/**
 * Run ff_j_rev_dct() on block, then pass the block to
 * put_pixels_clamped_c() to store it at dest.
 *
 * @param dest      destination pixels
 * @param line_size stride forwarded to put_pixels_clamped_c()
 * @param block     coefficient block, transformed in place by ff_j_rev_dct()
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct() on block, then pass the block to
 * add_pixels_clamped_c() to combine it with the pixels at dest.
 *
 * @param dest      destination pixels
 * @param line_size stride forwarded to add_pixels_clamped_c()
 * @param block     coefficient block, transformed in place by ff_j_rev_dct()
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2524
/**
 * Run ff_j_rev_dct4() on block, then pass the block to
 * put_pixels_clamped4_c() to store it at dest.
 * Installed by ff_dsputil_init() when avctx->lowres == 1.
 *
 * @param dest      destination pixels
 * @param line_size stride forwarded to put_pixels_clamped4_c()
 * @param block     coefficient block handed to ff_j_rev_dct4()
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct4() on block, then pass the block to
 * add_pixels_clamped4_c() to combine it with the pixels at dest.
 * Installed by ff_dsputil_init() when avctx->lowres == 1.
 *
 * @param dest      destination pixels
 * @param line_size stride forwarded to add_pixels_clamped4_c()
 * @param block     coefficient block handed to ff_j_rev_dct4()
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2535
/**
 * Run ff_j_rev_dct2() on block, then pass the block to
 * put_pixels_clamped2_c() to store it at dest.
 * Installed by ff_dsputil_init() when avctx->lowres == 2.
 *
 * @param dest      destination pixels
 * @param line_size stride forwarded to put_pixels_clamped2_c()
 * @param block     coefficient block handed to ff_j_rev_dct2()
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct2() on block, then pass the block to
 * add_pixels_clamped2_c() to combine it with the pixels at dest.
 * Installed by ff_dsputil_init() when avctx->lowres == 2.
 *
 * @param dest      destination pixels
 * @param line_size stride forwarded to add_pixels_clamped2_c()
 * @param block     coefficient block handed to ff_j_rev_dct2()
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2546
/**
 * Single-pixel "put": write (block[0] + 4) >> 3, clipped to 0..255 by
 * av_clip_uint8(), to dest[0].
 * Installed by ff_dsputil_init() when avctx->lowres == 3.
 *
 * @param dest      destination; only dest[0] is written
 * @param line_size unused
 * @param block     only block[0] is read
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dc);
}
/**
 * Single-pixel "add": add (block[0] + 4) >> 3 to dest[0] and store the
 * sum back, clipped to 0..255 by av_clip_uint8().
 * Installed by ff_dsputil_init() when avctx->lowres == 3.
 *
 * @param dest      destination; only dest[0] is read and written
 * @param line_size unused
 * @param block     only block[0] is read
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dest[0] + dc);
}
2555
2556 /* init static data */
2557 av_cold void ff_dsputil_static_init(void)
2558 {
2559     int i;
2560
2561     for(i=0;i<512;i++) {
2562         ff_squareTbl[i] = (i - 256) * (i - 256);
2563     }
2564 }
2565
2566 int ff_check_alignment(void){
2567     static int did_fail=0;
2568     LOCAL_ALIGNED_16(int, aligned, [4]);
2569
2570     if((intptr_t)aligned & 15){
2571         if(!did_fail){
2572 #if HAVE_MMX || HAVE_ALTIVEC
2573             av_log(NULL, AV_LOG_ERROR,
2574                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2575                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2576                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2577                 "Do not report crashes to FFmpeg developers.\n");
2578 #endif
2579             did_fail=1;
2580         }
2581         return -1;
2582     }
2583     return 0;
2584 }
2585
2586 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2587 {
2588     ff_check_alignment();
2589
2590 #if CONFIG_ENCODERS
2591     if (avctx->bits_per_raw_sample == 10) {
2592         c->fdct    = ff_jpeg_fdct_islow_10;
2593         c->fdct248 = ff_fdct248_islow_10;
2594     } else {
2595         if(avctx->dct_algo==FF_DCT_FASTINT) {
2596             c->fdct    = ff_fdct_ifast;
2597             c->fdct248 = ff_fdct_ifast248;
2598         }
2599         else if(avctx->dct_algo==FF_DCT_FAAN) {
2600             c->fdct    = ff_faandct;
2601             c->fdct248 = ff_faandct248;
2602         }
2603         else {
2604             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2605             c->fdct248 = ff_fdct248_islow_8;
2606         }
2607     }
2608 #endif //CONFIG_ENCODERS
2609
2610     if(avctx->lowres==1){
2611         c->idct_put= ff_jref_idct4_put;
2612         c->idct_add= ff_jref_idct4_add;
2613         c->idct    = ff_j_rev_dct4;
2614         c->idct_permutation_type= FF_NO_IDCT_PERM;
2615     }else if(avctx->lowres==2){
2616         c->idct_put= ff_jref_idct2_put;
2617         c->idct_add= ff_jref_idct2_add;
2618         c->idct    = ff_j_rev_dct2;
2619         c->idct_permutation_type= FF_NO_IDCT_PERM;
2620     }else if(avctx->lowres==3){
2621         c->idct_put= ff_jref_idct1_put;
2622         c->idct_add= ff_jref_idct1_add;
2623         c->idct    = ff_j_rev_dct1;
2624         c->idct_permutation_type= FF_NO_IDCT_PERM;
2625     }else{
2626         if (avctx->bits_per_raw_sample == 10) {
2627             c->idct_put              = ff_simple_idct_put_10;
2628             c->idct_add              = ff_simple_idct_add_10;
2629             c->idct                  = ff_simple_idct_10;
2630             c->idct_permutation_type = FF_NO_IDCT_PERM;
2631         } else if (avctx->bits_per_raw_sample == 12) {
2632             c->idct_put              = ff_simple_idct_put_12;
2633             c->idct_add              = ff_simple_idct_add_12;
2634             c->idct                  = ff_simple_idct_12;
2635             c->idct_permutation_type = FF_NO_IDCT_PERM;
2636         } else {
2637         if(avctx->idct_algo==FF_IDCT_INT){
2638             c->idct_put= jref_idct_put;
2639             c->idct_add= jref_idct_add;
2640             c->idct    = ff_j_rev_dct;
2641             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2642         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2643             c->idct_put= ff_faanidct_put;
2644             c->idct_add= ff_faanidct_add;
2645             c->idct    = ff_faanidct;
2646             c->idct_permutation_type= FF_NO_IDCT_PERM;
2647         }else{ //accurate/default
2648             c->idct_put = ff_simple_idct_put_8;
2649             c->idct_add = ff_simple_idct_add_8;
2650             c->idct     = ff_simple_idct_8;
2651             c->idct_permutation_type= FF_NO_IDCT_PERM;
2652         }
2653         }
2654     }
2655
2656     c->diff_pixels = diff_pixels_c;
2657     c->put_pixels_clamped = put_pixels_clamped_c;
2658     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2659     c->add_pixels_clamped = add_pixels_clamped_c;
2660     c->sum_abs_dctelem = sum_abs_dctelem_c;
2661     c->gmc1 = gmc1_c;
2662     c->gmc = ff_gmc_c;
2663     c->pix_sum = pix_sum_c;
2664     c->pix_norm1 = pix_norm1_c;
2665
2666     c->fill_block_tab[0] = fill_block16_c;
2667     c->fill_block_tab[1] = fill_block8_c;
2668
2669     /* TODO [0] 16  [1] 8 */
2670     c->pix_abs[0][0] = pix_abs16_c;
2671     c->pix_abs[0][1] = pix_abs16_x2_c;
2672     c->pix_abs[0][2] = pix_abs16_y2_c;
2673     c->pix_abs[0][3] = pix_abs16_xy2_c;
2674     c->pix_abs[1][0] = pix_abs8_c;
2675     c->pix_abs[1][1] = pix_abs8_x2_c;
2676     c->pix_abs[1][2] = pix_abs8_y2_c;
2677     c->pix_abs[1][3] = pix_abs8_xy2_c;
2678
2679     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2680     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2681     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2682     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2683     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2684     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2685     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2686     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2687     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2688
2689     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2690     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2691     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2692     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2693     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2694     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2695     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2696     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2697     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2698
2699 #define dspfunc(PFX, IDX, NUM) \
2700     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2701     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2702     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2703     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2704     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2705     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2706     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2707     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2708     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2709     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2710     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2711     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2712     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2713     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2714     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2715     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2716
2717     dspfunc(put_qpel, 0, 16);
2718     dspfunc(put_no_rnd_qpel, 0, 16);
2719
2720     dspfunc(avg_qpel, 0, 16);
2721     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2722
2723     dspfunc(put_qpel, 1, 8);
2724     dspfunc(put_no_rnd_qpel, 1, 8);
2725
2726     dspfunc(avg_qpel, 1, 8);
2727     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2728
2729 #undef dspfunc
2730
2731     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2732     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2733     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2734     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2735     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2736     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2737     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2738     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2739
2740 #define SET_CMP_FUNC(name) \
2741     c->name[0]= name ## 16_c;\
2742     c->name[1]= name ## 8x8_c;
2743
2744     SET_CMP_FUNC(hadamard8_diff)
2745     c->hadamard8_diff[4]= hadamard8_intra16_c;
2746     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2747     SET_CMP_FUNC(dct_sad)
2748     SET_CMP_FUNC(dct_max)
2749 #if CONFIG_GPL
2750     SET_CMP_FUNC(dct264_sad)
2751 #endif
2752     c->sad[0]= pix_abs16_c;
2753     c->sad[1]= pix_abs8_c;
2754     c->sse[0]= sse16_c;
2755     c->sse[1]= sse8_c;
2756     c->sse[2]= sse4_c;
2757     SET_CMP_FUNC(quant_psnr)
2758     SET_CMP_FUNC(rd)
2759     SET_CMP_FUNC(bit)
2760     c->vsad[0]= vsad16_c;
2761     c->vsad[4]= vsad_intra16_c;
2762     c->vsad[5]= vsad_intra8_c;
2763     c->vsse[0]= vsse16_c;
2764     c->vsse[4]= vsse_intra16_c;
2765     c->vsse[5]= vsse_intra8_c;
2766     c->nsse[0]= nsse16_c;
2767     c->nsse[1]= nsse8_c;
2768 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2769     ff_dsputil_init_dwt(c);
2770 #endif
2771
2772     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2773
2774     c->add_bytes= add_bytes_c;
2775     c->diff_bytes= diff_bytes_c;
2776     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2777     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2778     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2779     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2780     c->bswap_buf= bswap_buf;
2781     c->bswap16_buf = bswap16_buf;
2782
2783     c->try_8x8basis= try_8x8basis_c;
2784     c->add_8x8basis= add_8x8basis_c;
2785
2786     c->vector_clipf = vector_clipf_c;
2787     c->scalarproduct_int16 = scalarproduct_int16_c;
2788     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2789     c->vector_clip_int32 = vector_clip_int32_c;
2790
2791     c->shrink[0]= av_image_copy_plane;
2792     c->shrink[1]= ff_shrink22;
2793     c->shrink[2]= ff_shrink44;
2794     c->shrink[3]= ff_shrink88;
2795
2796     c->add_pixels8 = add_pixels8_c;
2797
2798 #undef FUNC
2799 #undef FUNCC
2800 #define FUNC(f, depth) f ## _ ## depth
2801 #define FUNCC(f, depth) f ## _ ## depth ## _c
2802
2803     c->draw_edges                    = FUNCC(draw_edges, 8);
2804     c->clear_block                   = FUNCC(clear_block, 8);
2805     c->clear_blocks                  = FUNCC(clear_blocks, 8);
2806
2807 #define BIT_DEPTH_FUNCS(depth) \
2808     c->get_pixels                    = FUNCC(get_pixels,   depth);
2809
2810     switch (avctx->bits_per_raw_sample) {
2811     case 9:
2812     case 10:
2813     case 12:
2814     case 14:
2815         BIT_DEPTH_FUNCS(16);
2816         break;
2817     default:
2818         if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2819             BIT_DEPTH_FUNCS(8);
2820         }
2821         break;
2822     }
2823
2824
2825     if (ARCH_ALPHA)
2826         ff_dsputil_init_alpha(c, avctx);
2827     if (ARCH_ARM)
2828         ff_dsputil_init_arm(c, avctx);
2829     if (ARCH_BFIN)
2830         ff_dsputil_init_bfin(c, avctx);
2831     if (ARCH_PPC)
2832         ff_dsputil_init_ppc(c, avctx);
2833     if (ARCH_SH4)
2834         ff_dsputil_init_sh4(c, avctx);
2835     if (HAVE_VIS)
2836         ff_dsputil_init_vis(c, avctx);
2837     if (ARCH_X86)
2838         ff_dsputil_init_x86(c, avctx);
2839
2840     ff_init_scantable_permutation(c->idct_permutation,
2841                                   c->idct_permutation_type);
2842 }
2843
2844 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2845 {
2846     ff_dsputil_init(c, avctx);
2847 }
2848
2849 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2850 {
2851     ff_dsputil_init(c, avctx);
2852 }