git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41
/* Lookup table used for clamping: functions in this file index it as
 * (ff_cropTbl + MAX_NEG_CROP)[value], so MAX_NEG_CROP entries are reserved
 * on either side of the central 256. Zero-initialised here; presumably
 * filled in by an init routine elsewhere -- confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table indexed as (ff_squareTbl + 256)[d] with d a pixel value or a
 * difference of two pixel values (see the sse*_c functions). Zero-initialised
 * here; presumably filled with squares elsewhere -- confirm. */
uint32_t ff_squareTbl[512] = {0, };
  44
/* dsputil_template.c is included three times, once per BIT_DEPTH value
 * (9, 10, then 8). BIT_DEPTH is deliberately left defined as 8 after the
 * last include (there is no trailing #undef). The *_8_c helpers referenced
 * further down (e.g. put_pixels2_8_c) presumably come from the 8-bit
 * instantiation -- confirm against dsputil_template.c. */
#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
  55
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// ~0UL/255 is 0x01 replicated into every byte of an unsigned long; multiplying
// by a byte value replicates that byte. The width is that of unsigned long,
// which is not necessarily the native register width on every ABI.
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  59
/* Zigzag scan order for an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  70
/* Specific zigzag scan for the 248 IDCT. NOTE that, unlike the
   specification, we interleave the fields. Each entry is a raster
   position (row * 8 + column) within the 8x8 block. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  83
/* Non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer.
 * 64 uint16_t entries, 16-byte aligned; only declared here, so it is
 * presumably filled in by an init routine elsewhere -- confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  86
/* Alternate horizontal scan order for an 8x8 block: entry i is the raster
 * position (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  97
/* Alternate vertical scan order for an 8x8 block: entry i is the raster
 * position (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 108
/* Input permutation for the simple_idct_mmx: a 64-entry table whose values
 * are coefficient positions 0x00..0x3F, each appearing once. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 120
/* 8-entry permutation of 0..7; going by the name it is the per-row
 * coefficient order for the SSE2 IDCT -- its use is not visible here. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 122
 123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 124     int i;
 125     int end;
 126
 127     st->scantable= src_scantable;
 128
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = src_scantable[i];
 132         st->permutated[i] = permutation[j];
 133 #if ARCH_PPC
 134         st->inverse[j] = i;
 135 #endif
 136     }
 137
 138     end=-1;
 139     for(i=0; i<64; i++){
 140         int j;
 141         j = st->permutated[i];
 142         if(j>end) end=j;
 143         st->raster_end[i]= end;
 144     }
 145 }
 146
 147 static int pix_sum_c(uint8_t * pix, int line_size)
 148 {
 149     int s, i, j;
 150
 151     s = 0;
 152     for (i = 0; i < 16; i++) {
 153         for (j = 0; j < 16; j += 8) {
 154             s += pix[0];
 155             s += pix[1];
 156             s += pix[2];
 157             s += pix[3];
 158             s += pix[4];
 159             s += pix[5];
 160             s += pix[6];
 161             s += pix[7];
 162             pix += 8;
 163         }
 164         pix += line_size - 16;
 165     }
 166     return s;
 167 }
 168
 169 static int pix_norm1_c(uint8_t * pix, int line_size)
 170 {
 171     int s, i, j;
 172     uint32_t *sq = ff_squareTbl + 256;
 173
 174     s = 0;
 175     for (i = 0; i < 16; i++) {
 176         for (j = 0; j < 16; j += 8) {
 177 #if 0
 178             s += sq[pix[0]];
 179             s += sq[pix[1]];
 180             s += sq[pix[2]];
 181             s += sq[pix[3]];
 182             s += sq[pix[4]];
 183             s += sq[pix[5]];
 184             s += sq[pix[6]];
 185             s += sq[pix[7]];
 186 #else
 187 #if HAVE_FAST_64BIT
 188             register uint64_t x=*(uint64_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             s += sq[(x>>32)&0xff];
 194             s += sq[(x>>40)&0xff];
 195             s += sq[(x>>48)&0xff];
 196             s += sq[(x>>56)&0xff];
 197 #else
 198             register uint32_t x=*(uint32_t*)pix;
 199             s += sq[x&0xff];
 200             s += sq[(x>>8)&0xff];
 201             s += sq[(x>>16)&0xff];
 202             s += sq[(x>>24)&0xff];
 203             x=*(uint32_t*)(pix+4);
 204             s += sq[x&0xff];
 205             s += sq[(x>>8)&0xff];
 206             s += sq[(x>>16)&0xff];
 207             s += sq[(x>>24)&0xff];
 208 #endif
 209 #endif
 210             pix += 8;
 211         }
 212         pix += line_size - 16;
 213     }
 214     return s;
 215 }
 216
 217 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 218     int i;
 219
 220     for(i=0; i+8<=w; i+=8){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222         dst[i+1]= av_bswap32(src[i+1]);
 223         dst[i+2]= av_bswap32(src[i+2]);
 224         dst[i+3]= av_bswap32(src[i+3]);
 225         dst[i+4]= av_bswap32(src[i+4]);
 226         dst[i+5]= av_bswap32(src[i+5]);
 227         dst[i+6]= av_bswap32(src[i+6]);
 228         dst[i+7]= av_bswap32(src[i+7]);
 229     }
 230     for(;i<w; i++){
 231         dst[i+0]= av_bswap32(src[i+0]);
 232     }
 233 }
 234
 235 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 236 {
 237     while (len--)
 238         *dst++ = av_bswap16(*src++);
 239 }
 240
 241 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 242 {
 243     int s, i;
 244     uint32_t *sq = ff_squareTbl + 256;
 245
 246     s = 0;
 247     for (i = 0; i < h; i++) {
 248         s += sq[pix1[0] - pix2[0]];
 249         s += sq[pix1[1] - pix2[1]];
 250         s += sq[pix1[2] - pix2[2]];
 251         s += sq[pix1[3] - pix2[3]];
 252         pix1 += line_size;
 253         pix2 += line_size;
 254     }
 255     return s;
 256 }
 257
 258 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 259 {
 260     int s, i;
 261     uint32_t *sq = ff_squareTbl + 256;
 262
 263     s = 0;
 264     for (i = 0; i < h; i++) {
 265         s += sq[pix1[0] - pix2[0]];
 266         s += sq[pix1[1] - pix2[1]];
 267         s += sq[pix1[2] - pix2[2]];
 268         s += sq[pix1[3] - pix2[3]];
 269         s += sq[pix1[4] - pix2[4]];
 270         s += sq[pix1[5] - pix2[5]];
 271         s += sq[pix1[6] - pix2[6]];
 272         s += sq[pix1[7] - pix2[7]];
 273         pix1 += line_size;
 274         pix2 += line_size;
 275     }
 276     return s;
 277 }
 278
 279 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 280 {
 281     int s, i;
 282     uint32_t *sq = ff_squareTbl + 256;
 283
 284     s = 0;
 285     for (i = 0; i < h; i++) {
 286         s += sq[pix1[ 0] - pix2[ 0]];
 287         s += sq[pix1[ 1] - pix2[ 1]];
 288         s += sq[pix1[ 2] - pix2[ 2]];
 289         s += sq[pix1[ 3] - pix2[ 3]];
 290         s += sq[pix1[ 4] - pix2[ 4]];
 291         s += sq[pix1[ 5] - pix2[ 5]];
 292         s += sq[pix1[ 6] - pix2[ 6]];
 293         s += sq[pix1[ 7] - pix2[ 7]];
 294         s += sq[pix1[ 8] - pix2[ 8]];
 295         s += sq[pix1[ 9] - pix2[ 9]];
 296         s += sq[pix1[10] - pix2[10]];
 297         s += sq[pix1[11] - pix2[11]];
 298         s += sq[pix1[12] - pix2[12]];
 299         s += sq[pix1[13] - pix2[13]];
 300         s += sq[pix1[14] - pix2[14]];
 301         s += sq[pix1[15] - pix2[15]];
 302
 303         pix1 += line_size;
 304         pix2 += line_size;
 305     }
 306     return s;
 307 }
 308
 309 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 310 {
 311     int i;
 312
 313     /* read the pixels */
 314     for(i=0;i<8;i++) {
 315         block[0] = pixels[0];
 316         block[1] = pixels[1];
 317         block[2] = pixels[2];
 318         block[3] = pixels[3];
 319         block[4] = pixels[4];
 320         block[5] = pixels[5];
 321         block[6] = pixels[6];
 322         block[7] = pixels[7];
 323         pixels += line_size;
 324         block += 8;
 325     }
 326 }
 327
 328 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 329                           const uint8_t *s2, int stride){
 330     int i;
 331
 332     /* read the pixels */
 333     for(i=0;i<8;i++) {
 334         block[0] = s1[0] - s2[0];
 335         block[1] = s1[1] - s2[1];
 336         block[2] = s1[2] - s2[2];
 337         block[3] = s1[3] - s2[3];
 338         block[4] = s1[4] - s2[4];
 339         block[5] = s1[5] - s2[5];
 340         block[6] = s1[6] - s2[6];
 341         block[7] = s1[7] - s2[7];
 342         s1 += stride;
 343         s2 += stride;
 344         block += 8;
 345     }
 346 }
 347
 348
 349 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 350                              int line_size)
 351 {
 352     int i;
 353     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 354
 355     /* read the pixels */
 356     for(i=0;i<8;i++) {
 357         pixels[0] = cm[block[0]];
 358         pixels[1] = cm[block[1]];
 359         pixels[2] = cm[block[2]];
 360         pixels[3] = cm[block[3]];
 361         pixels[4] = cm[block[4]];
 362         pixels[5] = cm[block[5]];
 363         pixels[6] = cm[block[6]];
 364         pixels[7] = cm[block[7]];
 365
 366         pixels += line_size;
 367         block += 8;
 368     }
 369 }
 370
 371 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 372                                  int line_size)
 373 {
 374     int i;
 375     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 376
 377     /* read the pixels */
 378     for(i=0;i<4;i++) {
 379         pixels[0] = cm[block[0]];
 380         pixels[1] = cm[block[1]];
 381         pixels[2] = cm[block[2]];
 382         pixels[3] = cm[block[3]];
 383
 384         pixels += line_size;
 385         block += 8;
 386     }
 387 }
 388
 389 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 390                                  int line_size)
 391 {
 392     int i;
 393     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 394
 395     /* read the pixels */
 396     for(i=0;i<2;i++) {
 397         pixels[0] = cm[block[0]];
 398         pixels[1] = cm[block[1]];
 399
 400         pixels += line_size;
 401         block += 8;
 402     }
 403 }
 404
 405 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 406                                     uint8_t *restrict pixels,
 407                                     int line_size)
 408 {
 409     int i, j;
 410
 411     for (i = 0; i < 8; i++) {
 412         for (j = 0; j < 8; j++) {
 413             if (*block < -128)
 414                 *pixels = 0;
 415             else if (*block > 127)
 416                 *pixels = 255;
 417             else
 418                 *pixels = (uint8_t)(*block + 128);
 419             block++;
 420             pixels++;
 421         }
 422         pixels += (line_size - 8);
 423     }
 424 }
 425
 426 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 427                                     int line_size)
 428 {
 429     int i;
 430
 431     /* read the pixels */
 432     for(i=0;i<8;i++) {
 433         pixels[0] = block[0];
 434         pixels[1] = block[1];
 435         pixels[2] = block[2];
 436         pixels[3] = block[3];
 437         pixels[4] = block[4];
 438         pixels[5] = block[5];
 439         pixels[6] = block[6];
 440         pixels[7] = block[7];
 441
 442         pixels += line_size;
 443         block += 8;
 444     }
 445 }
 446
 447 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 448                              int line_size)
 449 {
 450     int i;
 451     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 452
 453     /* read the pixels */
 454     for(i=0;i<8;i++) {
 455         pixels[0] = cm[pixels[0] + block[0]];
 456         pixels[1] = cm[pixels[1] + block[1]];
 457         pixels[2] = cm[pixels[2] + block[2]];
 458         pixels[3] = cm[pixels[3] + block[3]];
 459         pixels[4] = cm[pixels[4] + block[4]];
 460         pixels[5] = cm[pixels[5] + block[5]];
 461         pixels[6] = cm[pixels[6] + block[6]];
 462         pixels[7] = cm[pixels[7] + block[7]];
 463         pixels += line_size;
 464         block += 8;
 465     }
 466 }
 467
 468 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 469                           int line_size)
 470 {
 471     int i;
 472     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 473
 474     /* read the pixels */
 475     for(i=0;i<4;i++) {
 476         pixels[0] = cm[pixels[0] + block[0]];
 477         pixels[1] = cm[pixels[1] + block[1]];
 478         pixels[2] = cm[pixels[2] + block[2]];
 479         pixels[3] = cm[pixels[3] + block[3]];
 480         pixels += line_size;
 481         block += 8;
 482     }
 483 }
 484
 485 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 486                           int line_size)
 487 {
 488     int i;
 489     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 490
 491     /* read the pixels */
 492     for(i=0;i<2;i++) {
 493         pixels[0] = cm[pixels[0] + block[0]];
 494         pixels[1] = cm[pixels[1] + block[1]];
 495         pixels += line_size;
 496         block += 8;
 497     }
 498 }
 499
 500 static int sum_abs_dctelem_c(DCTELEM *block)
 501 {
 502     int sum=0, i;
 503     for(i=0; i<64; i++)
 504         sum+= FFABS(block[i]);
 505     return sum;
 506 }
 507
 508 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 509 {
 510     int i;
 511
 512     for (i = 0; i < h; i++) {
 513         memset(block, value, 16);
 514         block += line_size;
 515     }
 516 }
 517
 518 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 519 {
 520     int i;
 521
 522     for (i = 0; i < h; i++) {
 523         memset(block, value, 8);
 524         block += line_size;
 525     }
 526 }
 527
 528 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 529 {
 530     int i, j;
 531     uint16_t *dst1 = (uint16_t *) dst;
 532     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 533
 534     for (j = 0; j < 8; j++) {
 535         for (i = 0; i < 8; i++) {
 536             dst1[i] = dst2[i] = src[i] * 0x0101;
 537         }
 538         src  += 8;
 539         dst1 += linesize;
 540         dst2 += linesize;
 541     }
 542 }
 543
 544 #define avg2(a,b) ((a+b+1)>>1)
 545 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 546
 547 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 548 {
 549     const int A=(16-x16)*(16-y16);
 550     const int B=(   x16)*(16-y16);
 551     const int C=(16-x16)*(   y16);
 552     const int D=(   x16)*(   y16);
 553     int i;
 554
 555     for(i=0; i<h; i++)
 556     {
 557         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 558         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 559         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 560         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 561         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 562         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 563         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 564         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 565         dst+= stride;
 566         src+= stride;
 567     }
 568 }
 569
/**
 * Global motion compensation for an 8-pixel-wide, h-row area.
 *
 * For every destination pixel a source coordinate is stepped through an
 * affine mapping (dxx/dyx per column, dxy/dyy per row, starting at ox/oy).
 * The coordinate is shifted right by 16; its low `shift` bits are the
 * fractional part and the remaining bits the integer sample position.
 * Positions inside the picture are bilinearly interpolated; positions
 * outside are clipped with av_clip() and interpolated only along the axis
 * that is still in range (or not at all if both are out of range).
 *
 * @param dst    destination, written at dst[y*stride + x]
 * @param src    source picture
 * @param stride row stride in bytes for both dst and src
 * @param h      number of rows to produce
 * @param ox,oy  starting source coordinate for the first pixel
 * @param dxx,dyx per-column increments of the source x / y coordinate
 * @param dxy,dyy per-row increments of the source x / y coordinate
 * @param shift  number of fractional bits left after the >>16
 * @param r      rounding constant added before the final shift
 * @param width,height source dimensions used for the range checks
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* compare and clip against the last valid index; presumably this keeps
     * the +1 neighbour reads of the interpolation inside the picture */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the coordinate into fraction (low `shift` bits) and integer part */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare also rejects negative coordinates */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y out of range: clip the row, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x out of range: clip the column, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: copy the clipped sample unchanged */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            /* advance the source coordinate by one destination column */
            vx+= dxx;
            vy+= dyx;
        }
        /* advance the row start coordinate by one destination row */
        ox += dxy;
        oy += dyy;
    }
}
 627
 628 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 629     switch(width){
 630     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 631     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 632     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 633     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 634     }
 635 }
 636
 637 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 638     int i,j;
 639     for (i=0; i < height; i++) {
 640       for (j=0; j < width; j++) {
 641         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 642       }
 643       src += stride;
 644       dst += stride;
 645     }
 646 }
 647
 648 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 649     int i,j;
 650     for (i=0; i < height; i++) {
 651       for (j=0; j < width; j++) {
 652         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 653       }
 654       src += stride;
 655       dst += stride;
 656     }
 657 }
 658
 659 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 660     int i,j;
 661     for (i=0; i < height; i++) {
 662       for (j=0; j < width; j++) {
 663         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 664       }
 665       src += stride;
 666       dst += stride;
 667     }
 668 }
 669
 670 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 671     int i,j;
 672     for (i=0; i < height; i++) {
 673       for (j=0; j < width; j++) {
 674         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 675       }
 676       src += stride;
 677       dst += stride;
 678     }
 679 }
 680
 681 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 682     int i,j;
 683     for (i=0; i < height; i++) {
 684       for (j=0; j < width; j++) {
 685         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 686       }
 687       src += stride;
 688       dst += stride;
 689     }
 690 }
 691
 692 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 693     int i,j;
 694     for (i=0; i < height; i++) {
 695       for (j=0; j < width; j++) {
 696         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 697       }
 698       src += stride;
 699       dst += stride;
 700     }
 701 }
 702
 703 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 704     int i,j;
 705     for (i=0; i < height; i++) {
 706       for (j=0; j < width; j++) {
 707         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 708       }
 709       src += stride;
 710       dst += stride;
 711     }
 712 }
 713
 714 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 715     int i,j;
 716     for (i=0; i < height; i++) {
 717       for (j=0; j < width; j++) {
 718         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 719       }
 720       src += stride;
 721       dst += stride;
 722     }
 723 }
 724
 725 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 726     switch(width){
 727     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 728     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 729     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 730     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 731     }
 732 }
 733
 734 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 735     int i,j;
 736     for (i=0; i < height; i++) {
 737       for (j=0; j < width; j++) {
 738         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 739       }
 740       src += stride;
 741       dst += stride;
 742     }
 743 }
 744
 745 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 746     int i,j;
 747     for (i=0; i < height; i++) {
 748       for (j=0; j < width; j++) {
 749         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 750       }
 751       src += stride;
 752       dst += stride;
 753     }
 754 }
 755
 756 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 757     int i,j;
 758     for (i=0; i < height; i++) {
 759       for (j=0; j < width; j++) {
 760         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 761       }
 762       src += stride;
 763       dst += stride;
 764     }
 765 }
 766
 767 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 768     int i,j;
 769     for (i=0; i < height; i++) {
 770       for (j=0; j < width; j++) {
 771         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 772       }
 773       src += stride;
 774       dst += stride;
 775     }
 776 }
 777
 778 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 779     int i,j;
 780     for (i=0; i < height; i++) {
 781       for (j=0; j < width; j++) {
 782         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 783       }
 784       src += stride;
 785       dst += stride;
 786     }
 787 }
 788
 789 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 790     int i,j;
 791     for (i=0; i < height; i++) {
 792       for (j=0; j < width; j++) {
 793         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 794       }
 795       src += stride;
 796       dst += stride;
 797     }
 798 }
 799
 800 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 801     int i,j;
 802     for (i=0; i < height; i++) {
 803       for (j=0; j < width; j++) {
 804         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 805       }
 806       src += stride;
 807       dst += stride;
 808     }
 809 }
 810
 811 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 812     int i,j;
 813     for (i=0; i < height; i++) {
 814       for (j=0; j < width; j++) {
 815         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 816       }
 817       src += stride;
 818       dst += stride;
 819     }
 820 }
 821
 822 #define QPEL_MC(r, OPNAME, RND, OP) \
 823 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 824     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 825     int i;\
 826     for(i=0; i<h; i++)\
 827     {\
 828         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 829         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 830         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 831         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 832         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 833         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 834         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 835         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 836         dst+=dstStride;\
 837         src+=srcStride;\
 838     }\
 839 }\
 840 \
 841 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 842     const int w=8;\
 843     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 844     int i;\
 845     for(i=0; i<w; i++)\
 846     {\
 847         const int src0= src[0*srcStride];\
 848         const int src1= src[1*srcStride];\
 849         const int src2= src[2*srcStride];\
 850         const int src3= src[3*srcStride];\
 851         const int src4= src[4*srcStride];\
 852         const int src5= src[5*srcStride];\
 853         const int src6= src[6*srcStride];\
 854         const int src7= src[7*srcStride];\
 855         const int src8= src[8*srcStride];\
 856         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 857         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 858         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 859         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 860         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 861         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 862         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 863         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 864         dst++;\
 865         src++;\
 866     }\
 867 }\
 868 \
 869 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 870     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 871     int i;\
 872     \
 873     for(i=0; i<h; i++)\
 874     {\
 875         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 876         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 877         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 878         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 879         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 880         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 881         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 882         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 883         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 884         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 885         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 886         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 887         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 888         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 889         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 890         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 891         dst+=dstStride;\
 892         src+=srcStride;\
 893     }\
 894 }\
 895 \
 896 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 897     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 898     int i;\
 899     const int w=16;\
 900     for(i=0; i<w; i++)\
 901     {\
 902         const int src0= src[0*srcStride];\
 903         const int src1= src[1*srcStride];\
 904         const int src2= src[2*srcStride];\
 905         const int src3= src[3*srcStride];\
 906         const int src4= src[4*srcStride];\
 907         const int src5= src[5*srcStride];\
 908         const int src6= src[6*srcStride];\
 909         const int src7= src[7*srcStride];\
 910         const int src8= src[8*srcStride];\
 911         const int src9= src[9*srcStride];\
 912         const int src10= src[10*srcStride];\
 913         const int src11= src[11*srcStride];\
 914         const int src12= src[12*srcStride];\
 915         const int src13= src[13*srcStride];\
 916         const int src14= src[14*srcStride];\
 917         const int src15= src[15*srcStride];\
 918         const int src16= src[16*srcStride];\
 919         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 920         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 921         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 922         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 923         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 924         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 925         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 926         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 927         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 928         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 929         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 930         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 931         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 932         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 933         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 934         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 935         dst++;\
 936         src++;\
 937     }\
 938 }\
 939 \
 940 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 941     uint8_t half[64];\
 942     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 943     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 944 }\
 945 \
 946 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 947     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 948 }\
 949 \
 950 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 951     uint8_t half[64];\
 952     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 953     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 954 }\
 955 \
 956 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 957     uint8_t full[16*9];\
 958     uint8_t half[64];\
 959     copy_block9(full, src, 16, stride, 9);\
 960     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 961     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 962 }\
 963 \
 964 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 965     uint8_t full[16*9];\
 966     copy_block9(full, src, 16, stride, 9);\
 967     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 968 }\
 969 \
 970 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 971     uint8_t full[16*9];\
 972     uint8_t half[64];\
 973     copy_block9(full, src, 16, stride, 9);\
 974     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 975     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 976 }\
 977 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
 978     uint8_t full[16*9];\
 979     uint8_t halfH[72];\
 980     uint8_t halfV[64];\
 981     uint8_t halfHV[64];\
 982     copy_block9(full, src, 16, stride, 9);\
 983     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 984     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
 985     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 986     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
 987 }\
 988 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
 989     uint8_t full[16*9];\
 990     uint8_t halfH[72];\
 991     uint8_t halfHV[64];\
 992     copy_block9(full, src, 16, stride, 9);\
 993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
 994     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
 995     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
 996     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
 997 }\
 998 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
 999     uint8_t full[16*9];\
1000     uint8_t halfH[72];\
1001     uint8_t halfV[64];\
1002     uint8_t halfHV[64];\
1003     copy_block9(full, src, 16, stride, 9);\
1004     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1006     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1008 }\
1009 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1010     uint8_t full[16*9];\
1011     uint8_t halfH[72];\
1012     uint8_t halfHV[64];\
1013     copy_block9(full, src, 16, stride, 9);\
1014     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1016     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1018 }\
1019 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020     uint8_t full[16*9];\
1021     uint8_t halfH[72];\
1022     uint8_t halfV[64];\
1023     uint8_t halfHV[64];\
1024     copy_block9(full, src, 16, stride, 9);\
1025     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1029 }\
1030 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1031     uint8_t full[16*9];\
1032     uint8_t halfH[72];\
1033     uint8_t halfHV[64];\
1034     copy_block9(full, src, 16, stride, 9);\
1035     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1037     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1039 }\
1040 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1041     uint8_t full[16*9];\
1042     uint8_t halfH[72];\
1043     uint8_t halfV[64];\
1044     uint8_t halfHV[64];\
1045     copy_block9(full, src, 16, stride, 9);\
1046     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1047     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1048     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1050 }\
1051 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1052     uint8_t full[16*9];\
1053     uint8_t halfH[72];\
1054     uint8_t halfHV[64];\
1055     copy_block9(full, src, 16, stride, 9);\
1056     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1057     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1058     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1059     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1060 }\
1061 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1062     uint8_t halfH[72];\
1063     uint8_t halfHV[64];\
1064     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1065     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1066     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1067 }\
1068 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1069     uint8_t halfH[72];\
1070     uint8_t halfHV[64];\
1071     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1072     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1073     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1074 }\
1075 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1076     uint8_t full[16*9];\
1077     uint8_t halfH[72];\
1078     uint8_t halfV[64];\
1079     uint8_t halfHV[64];\
1080     copy_block9(full, src, 16, stride, 9);\
1081     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1082     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1083     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1084     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1085 }\
1086 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1087     uint8_t full[16*9];\
1088     uint8_t halfH[72];\
1089     copy_block9(full, src, 16, stride, 9);\
1090     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1091     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1092     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1093 }\
1094 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1095     uint8_t full[16*9];\
1096     uint8_t halfH[72];\
1097     uint8_t halfV[64];\
1098     uint8_t halfHV[64];\
1099     copy_block9(full, src, 16, stride, 9);\
1100     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1101     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1102     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1103     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1104 }\
1105 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1106     uint8_t full[16*9];\
1107     uint8_t halfH[72];\
1108     copy_block9(full, src, 16, stride, 9);\
1109     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1110     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1111     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1112 }\
1113 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1114     uint8_t halfH[72];\
1115     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1116     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1117 }\
1118 \
1119 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1120     uint8_t half[256];\
1121     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1122     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1123 }\
1124 \
1125 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1126     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1127 }\
1128 \
1129 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1130     uint8_t half[256];\
1131     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1133 }\
1134 \
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1136     uint8_t full[24*17];\
1137     uint8_t half[256];\
1138     copy_block17(full, src, 24, stride, 17);\
1139     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1140     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1141 }\
1142 \
1143 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1144     uint8_t full[24*17];\
1145     copy_block17(full, src, 24, stride, 17);\
1146     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1147 }\
1148 \
1149 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1150     uint8_t full[24*17];\
1151     uint8_t half[256];\
1152     copy_block17(full, src, 24, stride, 17);\
1153     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1154     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1155 }\
1156 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1157     uint8_t full[24*17];\
1158     uint8_t halfH[272];\
1159     uint8_t halfV[256];\
1160     uint8_t halfHV[256];\
1161     copy_block17(full, src, 24, stride, 17);\
1162     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1163     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1164     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1165     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1166 }\
1167 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1168     uint8_t full[24*17];\
1169     uint8_t halfH[272];\
1170     uint8_t halfHV[256];\
1171     copy_block17(full, src, 24, stride, 17);\
1172     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1173     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1174     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1176 }\
1177 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178     uint8_t full[24*17];\
1179     uint8_t halfH[272];\
1180     uint8_t halfV[256];\
1181     uint8_t halfHV[256];\
1182     copy_block17(full, src, 24, stride, 17);\
1183     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1185     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1187 }\
1188 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1189     uint8_t full[24*17];\
1190     uint8_t halfH[272];\
1191     uint8_t halfHV[256];\
1192     copy_block17(full, src, 24, stride, 17);\
1193     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1195     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1197 }\
1198 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199     uint8_t full[24*17];\
1200     uint8_t halfH[272];\
1201     uint8_t halfV[256];\
1202     uint8_t halfHV[256];\
1203     copy_block17(full, src, 24, stride, 17);\
1204     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1208 }\
1209 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1210     uint8_t full[24*17];\
1211     uint8_t halfH[272];\
1212     uint8_t halfHV[256];\
1213     copy_block17(full, src, 24, stride, 17);\
1214     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1216     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1218 }\
1219 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1220     uint8_t full[24*17];\
1221     uint8_t halfH[272];\
1222     uint8_t halfV[256];\
1223     uint8_t halfHV[256];\
1224     copy_block17(full, src, 24, stride, 17);\
1225     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1226     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1227     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1229 }\
1230 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1231     uint8_t full[24*17];\
1232     uint8_t halfH[272];\
1233     uint8_t halfHV[256];\
1234     copy_block17(full, src, 24, stride, 17);\
1235     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1236     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1237     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1239 }\
1240 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1241     uint8_t halfH[272];\
1242     uint8_t halfHV[256];\
1243     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1244     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1245     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1246 }\
1247 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1248     uint8_t halfH[272];\
1249     uint8_t halfHV[256];\
1250     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1253 }\
1254 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1255     uint8_t full[24*17];\
1256     uint8_t halfH[272];\
1257     uint8_t halfV[256];\
1258     uint8_t halfHV[256];\
1259     copy_block17(full, src, 24, stride, 17);\
1260     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1261     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1262     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1263     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1264 }\
1265 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1266     uint8_t full[24*17];\
1267     uint8_t halfH[272];\
1268     copy_block17(full, src, 24, stride, 17);\
1269     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1270     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1271     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1272 }\
1273 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1274     uint8_t full[24*17];\
1275     uint8_t halfH[272];\
1276     uint8_t halfV[256];\
1277     uint8_t halfHV[256];\
1278     copy_block17(full, src, 24, stride, 17);\
1279     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1280     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1281     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1282     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1283 }\
1284 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1285     uint8_t full[24*17];\
1286     uint8_t halfH[272];\
1287     copy_block17(full, src, 24, stride, 17);\
1288     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1289     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1290     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1291 }\
1292 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1293     uint8_t halfH[272];\
1294     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1295     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1296 }
1297
/*
 * Store operators handed to QPEL_MC as its OP argument.
 *
 * 'a' is the destination lvalue and 'b' the raw filter sum.  The taps used
 * by the mpeg4_qpel lowpass functions (20, -6, 3, -1 on each side) add up
 * to 32, hence the >>5 normalisation.  The rounded variants add 16 before
 * the shift, the no_rnd variants add 15.  The result is looked up through
 * 'cm', which must be in scope where the macro expands (the lowpass
 * functions set it to ff_cropTbl + MAX_NEG_CROP).
 *
 * The avg variants additionally average the new value with what is already
 * stored in 'a' (with +1 before the >>1 for op_avg, without it for
 * op_avg_no_rnd).
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the qpel function families.  Judging from the macro body, the
 * second argument is the function-name prefix, the third the infix selecting
 * which put*_ helpers are used for intermediate passes, and the fourth the
 * store operator.  op_avg_no_rnd is currently unused because the matching
 * instantiation below is commented out. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meaningful inside QPEL_MC; drop them again. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1311
/*
 * Alias the qpel mc00 names onto existing ff_{put,avg}_pixels* functions
 * instead of defining dedicated functions for them.  The put_no_rnd names
 * map onto put functions as well (there is no separate no_rnd target here).
 *
 * NOTE(review): the last alias targets ff_put_pixels16x16_8_c (with an "_8"
 * infix) while the other five use names without it -- confirm that this is
 * the intended symbol.
 */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1318
1319 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1320     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1321     int i;
1322
1323     for(i=0; i<h; i++){
1324         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1325         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1326         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1327         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1328         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1329         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1330         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1331         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1332         dst+=dstStride;
1333         src+=srcStride;
1334     }
1335 }
1336
#if CONFIG_RV40_DECODER
/*
 * RV40 qpel mc33 entry points.  Each one simply forwards to the 8-bit
 * {put,avg}_pixels{16,8}_xy2 function of matching size, passing the block
 * size (16 or 8) as the final argument.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1351
1352 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1353     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1354     int i;
1355
1356     for(i=0; i<w; i++){
1357         const int src_1= src[ -srcStride];
1358         const int src0 = src[0          ];
1359         const int src1 = src[  srcStride];
1360         const int src2 = src[2*srcStride];
1361         const int src3 = src[3*srcStride];
1362         const int src4 = src[4*srcStride];
1363         const int src5 = src[5*srcStride];
1364         const int src6 = src[6*srcStride];
1365         const int src7 = src[7*srcStride];
1366         const int src8 = src[8*srcStride];
1367         const int src9 = src[9*srcStride];
1368         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1369         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1370         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1371         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1372         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1373         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1374         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1375         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1376         src++;
1377         dst++;
1378     }
1379 }
1380
/**
 * 8x8 mspel mc10: horizontally filter the block into a temporary, then
 * combine it with the unshifted source via put_pixels8_l2_8 (presumably a
 * two-source average -- that helper is defined outside this chunk).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1386
/**
 * 8x8 mspel mc20: write the horizontally filtered block straight to dst
 * (8 rows, same stride for source and destination).
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1390
/**
 * 8x8 mspel mc30: horizontally filter the block into a temporary, then
 * combine it with the source shifted one pixel to the right via
 * put_pixels8_l2_8 (presumably a two-source average -- that helper is
 * defined outside this chunk).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1396
/**
 * 8x8 mspel mc02: write the vertically filtered block straight to dst
 * (8 columns, same stride for source and destination).
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1400
/**
 * 8x8 mspel mc12: combine a vertically filtered block with a block that was
 * filtered horizontally and then vertically.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter applied to it afterwards reads one row above and two
 * rows below the 8 output rows.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[8 * 11];
    uint8_t vertOnly[8 * 8];
    uint8_t horVert[8 * 8];

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vertOnly, src, 8, stride, 8);
    /* rowsH + 8 skips the extra top row so it becomes the filter's row -1 */
    wmv2_mspel8_v_lowpass(horVert, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vertOnly, horVert, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc32: like mc12, but the vertically filtered block is taken
 * from the source shifted one pixel to the right (src + 1).
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter applied to it afterwards reads one row above and two
 * rows below the 8 output rows.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[8 * 11];
    uint8_t vertOnly[8 * 8];
    uint8_t horVert[8 * 8];

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vertOnly, src + 1, 8, stride, 8);
    /* rowsH + 8 skips the extra top row so it becomes the filter's row -1 */
    wmv2_mspel8_v_lowpass(horVert, rowsH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vertOnly, horVert, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc22: horizontal filter followed by vertical filter, written
 * directly to dst.
 *
 * The horizontal pass covers 11 rows starting one row above src so that the
 * vertical pass has the extra rows it reads above and below the block.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowsH[8 * 11];

    wmv2_mspel8_h_lowpass(rowsH, src - stride, 8, stride, 11);
    /* rowsH + 8 skips the extra top row so it becomes the filter's row -1 */
    wmv2_mspel8_v_lowpass(dst, rowsH + 8, stride, 8, 8);
}
1424
1425 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1426     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1427     int x;
1428     const int strength= ff_h263_loop_filter_strength[qscale];
1429
1430     for(x=0; x<8; x++){
1431         int d1, d2, ad1;
1432         int p0= src[x-2*stride];
1433         int p1= src[x-1*stride];
1434         int p2= src[x+0*stride];
1435         int p3= src[x+1*stride];
1436         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1437
1438         if     (d<-2*strength) d1= 0;
1439         else if(d<-  strength) d1=-2*strength - d;
1440         else if(d<   strength) d1= d;
1441         else if(d< 2*strength) d1= 2*strength - d;
1442         else                   d1= 0;
1443
1444         p1 += d1;
1445         p2 -= d1;
1446         if(p1&256) p1= ~(p1>>31);
1447         if(p2&256) p2= ~(p2>>31);
1448
1449         src[x-1*stride] = p1;
1450         src[x+0*stride] = p2;
1451
1452         ad1= FFABS(d1)>>1;
1453
1454         d2= av_clip((p0-p3)/4, -ad1, ad1);
1455
1456         src[x-2*stride] = p0 - d2;
1457         src[x+  stride] = p3 + d2;
1458     }
1459     }
1460 }
1461
1462 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1463     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1464     int y;
1465     const int strength= ff_h263_loop_filter_strength[qscale];
1466
1467     for(y=0; y<8; y++){
1468         int d1, d2, ad1;
1469         int p0= src[y*stride-2];
1470         int p1= src[y*stride-1];
1471         int p2= src[y*stride+0];
1472         int p3= src[y*stride+1];
1473         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1474
1475         if     (d<-2*strength) d1= 0;
1476         else if(d<-  strength) d1=-2*strength - d;
1477         else if(d<   strength) d1= d;
1478         else if(d< 2*strength) d1= 2*strength - d;
1479         else                   d1= 0;
1480
1481         p1 += d1;
1482         p2 -= d1;
1483         if(p1&256) p1= ~(p1>>31);
1484         if(p2&256) p2= ~(p2>>31);
1485
1486         src[y*stride-1] = p1;
1487         src[y*stride+0] = p2;
1488
1489         ad1= FFABS(d1)>>1;
1490
1491         d2= av_clip((p0-p3)/4, -ad1, ad1);
1492
1493         src[y*stride-2] = p0 - d2;
1494         src[y*stride+1] = p3 + d2;
1495     }
1496     }
1497 }
1498
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch buffer: rows 1..6 get
 * above + 2*centre + below, while the top and bottom rows are just scaled
 * by 4 so that every scratch value carries the same gain.
 * Pass 2 filters the scratch rows horizontally and writes back to src:
 * columns 1..6 use (left + 2*centre + right + 8) >> 4, the first and last
 * column only undo the vertical gain with (v + 2) >> 2.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    /* vertical pass: border rows first, then the interior rows */
    for (col = 0; col < 8; col++) {
        vert[0][col] = 4 * src[col];
        vert[7][col] = 4 * src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *const cur = src + row*stride;

        for (col = 0; col < 8; col++)
            vert[row][col] = cur[col - stride] + 2*cur[col] + cur[col + stride];
    }

    /* horizontal pass, writing the final pixels back */
    for (row = 0; row < 8; row++) {
        uint8_t *const out = src + row*stride;
        const int *const in = vert[row];

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
    }
}
1525
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated |pix1 - pix2| over 16 * h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1553
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * formed by avg2() of each pix2 pixel with its right-hand neighbour.
 * Each pix2 row is therefore read from index 0 through 16.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 * h pixels
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1581
/**
 * SAD of a 16-wide block against a reference in which every sample is
 * avg2() of a pixel and the pixel directly below it (one line_size further).
 *
 * avg2() is defined outside this function.
 * Because of the row below, h + 1 rows of pix2 are read.
 *
 * @param v         context pointer, not used by this implementation
 * @param pix1      top-left sample of the block being scored
 * @param pix2      top-left sample of the reference block
 * @param line_size offset in bytes from one row to the next (both blocks)
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1611
/**
 * SAD of a 16-wide block against a reference in which every sample is
 * avg4() of a 2x2 neighbourhood: the pixel, its right neighbour and the
 * two pixels below those.
 *
 * avg4() is defined outside this function.
 * Columns 0..16 of h + 1 rows of pix2 are read.
 *
 * @param v         context pointer, not used by this implementation
 * @param pix1      top-left sample of the block being scored
 * @param pix2      top-left sample of the reference block
 * @param line_size offset in bytes from one row to the next (both blocks)
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1641
/**
 * Sum of absolute differences between two blocks that are 8 samples wide.
 *
 * @param v         context pointer, not used by this implementation
 * @param pix1      top-left sample of the first block
 * @param pix2      top-left sample of the second block
 * @param line_size offset in bytes from one row to the next (both blocks)
 * @param h         number of rows to compare
 * @return sum of |pix1 - pix2| over the 8 x h area
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1661
/**
 * SAD of an 8-wide block against a reference in which every sample is
 * avg2() of a pixel and its right-hand neighbour.
 *
 * avg2() is defined outside this function.
 * Each reference row is read from pix2[0] up to and including pix2[8].
 *
 * @param v         context pointer, not used by this implementation
 * @param pix1      top-left sample of the block being scored
 * @param pix2      top-left sample of the reference block
 * @param line_size offset in bytes from one row to the next (both blocks)
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1681
/**
 * SAD of an 8-wide block against a reference in which every sample is
 * avg2() of a pixel and the pixel directly below it (one line_size further).
 *
 * avg2() is defined outside this function.
 * Because of the row below, h + 1 rows of pix2 are read.
 *
 * @param v         context pointer, not used by this implementation
 * @param pix1      top-left sample of the block being scored
 * @param pix2      top-left sample of the reference block
 * @param line_size offset in bytes from one row to the next (both blocks)
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1703
/**
 * SAD of an 8-wide block against a reference in which every sample is
 * avg4() of a 2x2 neighbourhood: the pixel, its right neighbour and the
 * two pixels below those.
 *
 * avg4() is defined outside this function.
 * Columns 0..8 of h + 1 rows of pix2 are read.
 *
 * @param v         context pointer, not used by this implementation
 * @param pix1      top-left sample of the block being scored
 * @param pix2      top-left sample of the reference block
 * @param line_size offset in bytes from one row to the next (both blocks)
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1725
1726 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1727     MpegEncContext *c = v;
1728     int score1=0;
1729     int score2=0;
1730     int x,y;
1731
1732     for(y=0; y<h; y++){
1733         for(x=0; x<16; x++){
1734             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1735         }
1736         if(y+1<h){
1737             for(x=0; x<15; x++){
1738                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1739                              - s1[x+1] + s1[x+1+stride])
1740                         -FFABS(  s2[x  ] - s2[x  +stride]
1741                              - s2[x+1] + s2[x+1+stride]);
1742             }
1743         }
1744         s1+= stride;
1745         s2+= stride;
1746     }
1747
1748     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1749     else  return score1 + FFABS(score2)*8;
1750 }
1751
1752 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1753     MpegEncContext *c = v;
1754     int score1=0;
1755     int score2=0;
1756     int x,y;
1757
1758     for(y=0; y<h; y++){
1759         for(x=0; x<8; x++){
1760             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1761         }
1762         if(y+1<h){
1763             for(x=0; x<7; x++){
1764                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1765                              - s1[x+1] + s1[x+1+stride])
1766                         -FFABS(  s2[x  ] - s2[x  +stride]
1767                              - s2[x+1] + s2[x+1+stride]);
1768             }
1769         }
1770         s1+= stride;
1771         s2+= stride;
1772     }
1773
1774     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1775     else  return score1 + FFABS(score2)*8;
1776 }
1777
1778 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1779     int i;
1780     unsigned int sum=0;
1781
1782     for(i=0; i<8*8; i++){
1783         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1784         int w= weight[i];
1785         b>>= RECON_SHIFT;
1786         assert(-512<b && b<512);
1787
1788         sum += (w*b)*(w*b)>>4;
1789     }
1790     return sum>>2;
1791 }
1792
1793 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1794     int i;
1795
1796     for(i=0; i<8*8; i++){
1797         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1798     }
1799 }
1800
1801 /**
1802  * permutes an 8x8 block.
1803  * @param block the block which will be permuted according to the given permutation vector
1804  * @param permutation the permutation vector
1805  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1806  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1807  *                  (inverse) permutated to scantable order!
1808  */
1809 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1810 {
1811     int i;
1812     DCTELEM temp[64];
1813
1814     if(last<=0) return;
1815     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1816
1817     for(i=0; i<=last; i++){
1818         const int j= scantable[i];
1819         temp[j]= block[j];
1820         block[j]=0;
1821     }
1822
1823     for(i=0; i<=last; i++){
1824         const int j= scantable[i];
1825         const int perm_j= permutation[j];
1826         block[perm_j]= temp[j];
1827     }
1828 }
1829
/**
 * Comparison function that scores every pair of blocks as 0.
 * None of the arguments is read.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
1833
1834 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1835     int i;
1836
1837     memset(cmp, 0, sizeof(void*)*6);
1838
1839     for(i=0; i<6; i++){
1840         switch(type&0xFF){
1841         case FF_CMP_SAD:
1842             cmp[i]= c->sad[i];
1843             break;
1844         case FF_CMP_SATD:
1845             cmp[i]= c->hadamard8_diff[i];
1846             break;
1847         case FF_CMP_SSE:
1848             cmp[i]= c->sse[i];
1849             break;
1850         case FF_CMP_DCT:
1851             cmp[i]= c->dct_sad[i];
1852             break;
1853         case FF_CMP_DCT264:
1854             cmp[i]= c->dct264_sad[i];
1855             break;
1856         case FF_CMP_DCTMAX:
1857             cmp[i]= c->dct_max[i];
1858             break;
1859         case FF_CMP_PSNR:
1860             cmp[i]= c->quant_psnr[i];
1861             break;
1862         case FF_CMP_BIT:
1863             cmp[i]= c->bit[i];
1864             break;
1865         case FF_CMP_RD:
1866             cmp[i]= c->rd[i];
1867             break;
1868         case FF_CMP_VSAD:
1869             cmp[i]= c->vsad[i];
1870             break;
1871         case FF_CMP_VSSE:
1872             cmp[i]= c->vsse[i];
1873             break;
1874         case FF_CMP_ZERO:
1875             cmp[i]= zero_cmp;
1876             break;
1877         case FF_CMP_NSSE:
1878             cmp[i]= c->nsse[i];
1879             break;
1880 #if CONFIG_DWT
1881         case FF_CMP_W53:
1882             cmp[i]= c->w53[i];
1883             break;
1884         case FF_CMP_W97:
1885             cmp[i]= c->w97[i];
1886             break;
1887 #endif
1888         default:
1889             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1890         }
1891     }
1892 }
1893
1894 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1895     long i;
1896     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1897         long a = *(long*)(src+i);
1898         long b = *(long*)(dst+i);
1899         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1900     }
1901     for(; i<w; i++)
1902         dst[i+0] += src[i+0];
1903 }
1904
1905 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1906     long i;
1907 #if !HAVE_FAST_UNALIGNED
1908     if((long)src2 & (sizeof(long)-1)){
1909         for(i=0; i+7<w; i+=8){
1910             dst[i+0] = src1[i+0]-src2[i+0];
1911             dst[i+1] = src1[i+1]-src2[i+1];
1912             dst[i+2] = src1[i+2]-src2[i+2];
1913             dst[i+3] = src1[i+3]-src2[i+3];
1914             dst[i+4] = src1[i+4]-src2[i+4];
1915             dst[i+5] = src1[i+5]-src2[i+5];
1916             dst[i+6] = src1[i+6]-src2[i+6];
1917             dst[i+7] = src1[i+7]-src2[i+7];
1918         }
1919     }else
1920 #endif
1921     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1922         long a = *(long*)(src1+i);
1923         long b = *(long*)(src2+i);
1924         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1925     }
1926     for(; i<w; i++)
1927         dst[i+0] = src1[i+0]-src2[i+0];
1928 }
1929
/**
 * Rebuild a row from residuals using a three-way mid_pred() predictor.
 *
 * For every position the predictor is mid_pred() of the previous output,
 * the sample from src1 at the same position, and
 * (previous output + that src1 sample - previous src1 sample) & 0xFF.
 * diff[i] is added to the predictor and the low 8 bits are stored in dst[i].
 * mid_pred() is defined outside this function.
 *
 * @param dst      output row, w bytes
 * @param src1     reference row, w bytes
 * @param diff     residuals, w bytes
 * @param w        number of samples
 * @param left     in: output value preceding position 0 (low 8 bits used);
 *                 out: last output value
 * @param left_top in: src1 value preceding position 0 (low 8 bits used);
 *                 out: last src1 value
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t cur    = *left;
    uint8_t corner = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur + top - corner) & 0xFF;

        cur    = mid_pred(cur, top, gradient) + diff[x];
        corner = top;
        dst[x] = cur;
    }

    *left     = cur;
    *left_top = corner;
}
1946
/**
 * Produce residuals of a row against a three-way mid_pred() predictor.
 *
 * For every position the predictor is mid_pred() of the previous src2
 * sample, the src1 sample at the same position, and
 * (previous src2 sample + that src1 sample - previous src1 sample) & 0xFF.
 * dst[i] receives the low 8 bits of src2[i] minus the predictor.
 * mid_pred() is defined outside this function.
 *
 * @param dst      residual output, w bytes
 * @param src1     reference row, w bytes
 * @param src2     row being predicted, w bytes
 * @param w        number of samples
 * @param left     in: src2 value preceding position 0 (low 8 bits used);
 *                 out: last src2 value
 * @param left_top in: src1 value preceding position 0 (low 8 bits used);
 *                 out: last src1 value
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur    = *left;
    uint8_t corner = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur + top - corner) & 0xFF;
        const int pred     = mid_pred(cur, top, gradient);

        corner = top;
        cur    = src2[x];
        dst[x] = cur - pred;
    }

    *left     = cur;
    *left_top = corner;
}
1964
/**
 * Running sum of a row: every output byte is the low 8 bits of the
 * accumulator after adding the corresponding source byte.
 *
 * @param dst output row, w bytes
 * @param src input row, w bytes
 * @param w   number of samples
 * @param acc starting value of the accumulator
 * @return the accumulator after the last sample; it is an int and is not
 *         reduced to 8 bits
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
1983
/* Byte position of each channel inside a 4-byte pixel; the order is mirrored
 * when HAVE_BIGENDIAN is set. The names are removed again after the function. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over a row of 4-byte pixels.
 *
 * Each channel keeps its own accumulator; every output byte is the low
 * 8 bits of that channel's accumulator after adding the source byte.
 *
 * @param dst   output row, 4 * w bytes
 * @param src   input row, 4 * w bytes
 * @param w     number of pixels
 * @param red   in: starting accumulator of the R channel; out: final value
 * @param green in: starting accumulator of the G channel; out: final value
 * @param blue  in: starting accumulator of the B channel; out: final value
 * @param alpha in: starting accumulator of the A channel; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int r = *red;
    int g = *green;
    int b = *blue;
    int a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        b += in[B];
        g += in[G];
        r += in[R];
        a += in[A];

        out[B] = b;
        out[G] = g;
        out[R] = r;
        out[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
2024
/* BUTTERFLY2: store the sum of i1 and i2 in o1 and their difference in o2.
 * The expansion is two separate statements without an enclosing block, so
 * it must not be used as the unbraced body of an if/else/loop. i1 and i2
 * each appear twice in the expansion. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* BUTTERFLY1: in-place variant; x becomes x+y and y becomes x-y, both
 * computed from the values held before the update. The expansion is a block
 * that declares locals named a and b, so the arguments must not refer to
 * outer variables with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* BUTTERFLYA: FFABS(x+y) + FFABS(x-y) as an expression; nothing is stored.
 * x and y each appear twice in the expansion. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2039
/**
 * Score an 8x8 block pair by running butterfly stages over the sample
 * differences (src - dst), first along each row and then along each column,
 * and summing the BUTTERFLYA() outputs of the last column stage.
 *
 * @param s      context pointer, not used by this implementation
 * @param dst    top-left sample of the block subtracted from src
 * @param src    top-left sample of the other block
 * @param stride offset in bytes from one row to the next (both blocks)
 * @param h      must be 8 (checked with assert)
 * @return the accumulated score
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int total = 0;
    int k;

    assert(h == 8);

    /* row pass: three butterfly stages over the 8 differences of each row */
    for (k = 0; k < 8; k++) {
        //FIXME try pointer walks
        int *const row            = temp + 8 * k;
        const uint8_t *const srow = src + stride * k;
        const uint8_t *const drow = dst + stride * k;

        BUTTERFLY2(row[0], row[1], srow[0] - drow[0], srow[1] - drow[1]);
        BUTTERFLY2(row[2], row[3], srow[2] - drow[2], srow[3] - drow[3]);
        BUTTERFLY2(row[4], row[5], srow[4] - drow[4], srow[5] - drow[5]);
        BUTTERFLY2(row[6], row[7], srow[6] - drow[6], srow[7] - drow[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two stored stages, the third is folded into the sum */
    for (k = 0; k < 8; k++) {
        int *const col = temp + k;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        total += BUTTERFLYA(col[8 * 0], col[8 * 4])
               + BUTTERFLYA(col[8 * 1], col[8 * 5])
               + BUTTERFLYA(col[8 * 2], col[8 * 6])
               + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }
    return total;
}
2084
/**
 * Score a single 8x8 block by running butterfly stages over its samples,
 * first along each row and then along each column, and summing the
 * BUTTERFLYA() outputs of the last column stage. FFABS(temp[0] + temp[32])
 * is subtracted from the result at the end (the original marks this as
 * "-mean").
 *
 * @param s      context pointer, not used by this implementation
 * @param src    top-left sample of the block
 * @param dummy  not used
 * @param stride offset in bytes from one row to the next
 * @param h      must be 8 (checked with assert)
 * @return the accumulated score
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int total = 0;
    int k;

    assert(h == 8);

    /* row pass: three butterfly stages over the 8 samples of each row */
    for (k = 0; k < 8; k++) {
        //FIXME try pointer walks
        int *const row            = temp + 8 * k;
        const uint8_t *const srow = src + stride * k;

        BUTTERFLY2(row[0], row[1], srow[0], srow[1]);
        BUTTERFLY2(row[2], row[3], srow[2], srow[3]);
        BUTTERFLY2(row[4], row[5], srow[4], srow[5]);
        BUTTERFLY2(row[6], row[7], srow[6], srow[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two stored stages, the third is folded into the sum */
    for (k = 0; k < 8; k++) {
        int *const col = temp + k;

        BUTTERFLY1(col[8 * 0], col[8 * 1]);
        BUTTERFLY1(col[8 * 2], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 5]);
        BUTTERFLY1(col[8 * 6], col[8 * 7]);

        BUTTERFLY1(col[8 * 0], col[8 * 2]);
        BUTTERFLY1(col[8 * 1], col[8 * 3]);
        BUTTERFLY1(col[8 * 4], col[8 * 6]);
        BUTTERFLY1(col[8 * 5], col[8 * 7]);

        total += BUTTERFLYA(col[8 * 0], col[8 * 4])
               + BUTTERFLYA(col[8 * 1], col[8 * 5])
               + BUTTERFLYA(col[8 * 2], col[8 * 6])
               + BUTTERFLYA(col[8 * 3], col[8 * 7]);
    }

    total -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return total;
}
2132
2133 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2134     MpegEncContext * const s= (MpegEncContext *)c;
2135     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2136
2137     assert(h==8);
2138
2139     s->dsp.diff_pixels(temp, src1, src2, stride);
2140     s->dsp.fdct(temp);
2141     return s->dsp.sum_abs_dctelem(temp);
2142 }
2143
#if CONFIG_GPL
/* DCT8_1D: one 8-point 1-D transform pass built only from additions,
 * subtractions and right shifts. It reads its eight inputs through SRC(n)
 * and emits its eight outputs through DST(n, value); both helper macros
 * must be defined by the user before the expansion. Every SRC(n) is
 * evaluated twice (once for the sums, once for the differences). */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block pair by applying DCT8_1D to the sample difference,
 * first along every row and then along every column, and summing FFABS()
 * of the column-pass outputs.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels callback is used
 * @param src1   top-left sample of the first block
 * @param src2   top-left sample of the second block
 * @param stride offset in bytes from one row to the next (both blocks)
 * @param h      not referenced by this function
 * @return the accumulated sum
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: inputs and outputs are the entries of row i, updated in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: inputs are the entries of column i; outputs are not
     * stored, only accumulated into sum */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2196
2197 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2198     MpegEncContext * const s= (MpegEncContext *)c;
2199     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2200     int sum=0, i;
2201
2202     assert(h==8);
2203
2204     s->dsp.diff_pixels(temp, src1, src2, stride);
2205     s->dsp.fdct(temp);
2206
2207     for(i=0; i<64; i++)
2208         sum= FFMAX(sum, FFABS(temp[i]));
2209
2210     return sum;
2211 }
2212
2213 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2214     MpegEncContext * const s= (MpegEncContext *)c;
2215     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2216     DCTELEM * const bak = temp+64;
2217     int sum=0, i;
2218
2219     assert(h==8);
2220     s->mb_intra=0;
2221
2222     s->dsp.diff_pixels(temp, src1, src2, stride);
2223
2224     memcpy(bak, temp, 64*sizeof(DCTELEM));
2225
2226     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2227     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2228     ff_simple_idct_8(temp); //FIXME
2229
2230     for(i=0; i<64; i++)
2231         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2232
2233     return sum;
2234 }
2235
/**
 * Score an 8x8 block pair by combining a distortion term with a bit-count
 * term.
 *
 * Both blocks are copied into local 8x8 buffers, their difference is
 * quantized with fast_dct_quantize(), and the bits needed for the quantized
 * coefficients are summed from the context's VLC length tables. The
 * coefficients are then unquantized, added back onto the copy of src2 with
 * idct_add(), and the distortion is dsp.sse[1]() between that result and
 * the copy of src1.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext providing callbacks, tables, qscale, mb_intra
 * @param src1   top-left sample of the first block
 * @param src2   top-left sample of the second block
 * @param stride offset in bytes from one row to the next (both blocks)
 * @param h      must be 8 (checked with assert)
 * @return distortion + ((bits * qscale * qscale * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on packed copies (row pitch 8) so the caller's data is untouched */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i only receives the quantizer's last output argument here; it is
     * reused as a loop counter below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        /* coefficient 0 is charged through the luma DC table, so the scan
         * over the AC tables starts at position 1 */
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one: non-zero levels are charged
         * from the length table (or esc_length when level+64 falls outside
         * 0..127), zero levels extend the current run */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final coefficient is charged from the separate last_length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* rebuild the block on top of the copy of src2 and compare it with the
     * copy of src1 */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2311
2312 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2313     MpegEncContext * const s= (MpegEncContext *)c;
2314     const uint8_t *scantable= s->intra_scantable.permutated;
2315     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2316     int i, last, run, bits, level, start_i;
2317     const int esc_length= s->ac_esc_length;
2318     uint8_t * length;
2319     uint8_t * last_length;
2320
2321     assert(h==8);
2322
2323     s->dsp.diff_pixels(temp, src1, src2, stride);
2324
2325     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2326
2327     bits=0;
2328
2329     if (s->mb_intra) {
2330         start_i = 1;
2331         length     = s->intra_ac_vlc_length;
2332         last_length= s->intra_ac_vlc_last_length;
2333         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2334     } else {
2335         start_i = 0;
2336         length     = s->inter_ac_vlc_length;
2337         last_length= s->inter_ac_vlc_last_length;
2338     }
2339
2340     if(last>=start_i){
2341         run=0;
2342         for(i=start_i; i<last; i++){
2343             int j= scantable[i];
2344             level= temp[j];
2345
2346             if(level){
2347                 level+=64;
2348                 if((level&(~127)) == 0){
2349                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2350                 }else
2351                     bits+= esc_length;
2352                 run=0;
2353             }else
2354                 run++;
2355         }
2356         i= scantable[last];
2357
2358         level= temp[i] + 64;
2359
2360         assert(level - 64);
2361
2362         if((level&(~127)) == 0){
2363             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2364         }else
2365             bits+= esc_length;
2366     }
2367
2368     return bits;
2369 }
2370
/**
 * VSAD_INTRA(size) defines vsad_intra<size>_c(), which sums FFABS() of the
 * difference between every sample and the sample one stride below it, over
 * size columns and rows 0 .. h-2 of a single block. The context pointer and
 * the dummy pointer are not used. Instantiated for size 8 and 16.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - s[col + stride]);                                                       \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2388
/**
 * Sum, over 16 columns and rows 0 .. h-2, of FFABS() of the change in the
 * sample difference (s1 - s2) between a row and the row one stride below.
 *
 * @param c      context pointer, not used by this implementation
 * @param s1     top-left sample of the first block
 * @param s2     top-left sample of the second block
 * @param stride offset in bytes from one row to the next (both blocks)
 * @param h      number of rows; h - 1 row pairs are evaluated
 * @return the accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2403
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/**
 * VSSE_INTRA(size) defines vsse_intra<size>_c(), which sums the squared
 * difference between every sample and the sample one stride below it, over
 * size columns and rows 0 .. h-2 of a single block. The context pointer and
 * the dummy pointer are not used. Instantiated for size 8 and 16.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - s[col + stride]);                                                          \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2422
/**
 * Sum, over 16 columns and rows 0 .. h-2, of the squared change in the
 * sample difference (s1 - s2) between a row and the row one stride below.
 *
 * @param c      context pointer, not used by this implementation
 * @param s1     top-left sample of the first block
 * @param s2     top-left sample of the second block
 * @param stride offset in bytes from one row to the next (both blocks)
 * @param h      number of rows; h - 1 row pairs are evaluated
 * @return the accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2437
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector, size entries
 * @param pix2 second vector, size entries
 * @param size number of entries to compare
 * @return sum of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k = 0;

    while (k < size) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
        k++;
    }
    return total;
}
2446
/* Instantiate a *16_c companion for each 8x8 comparison function above.
 * WRAPPER8_16_SQ is defined outside this file section; its first argument
 * is the existing 8x8 function, the second the name of the function it
 * generates.
 * NOTE(review): presumably the generated function covers a larger block by
 * calling the 8x8 function on sub-blocks - confirm against the macro. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad8x8_c only exists when CONFIG_GPL is set */
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2457
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 *
 * @param dst  output vector, len entries
 * @param src0 first factor, len entries
 * @param src1 second factor, len entries
 * @param len  number of entries
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
2463
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len - 1 - i].
 *
 * @param dst  output vector, len entries
 * @param src0 first factor, len entries
 * @param src1 second factor, len entries, traversed in reverse
 * @param len  number of entries
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len)
{
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
2470
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output vector, len entries
 * @param src0 first factor, len entries
 * @param src1 second factor, len entries
 * @param src2 addend, len entries
 * @param len  number of entries
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
2476
/**
 * Windowed overlap of two float vectors.
 *
 * Walks src0 forwards and src1 backwards at the same time, combining each
 * pair with the two mirrored window coefficients and writing one result
 * into the lower half of dst and one into the upper half.
 *
 * @param dst  output, 2*len elements are written
 * @param src0 first input, len elements
 * @param src1 second input, len elements
 * @param win  window, 2*len elements are read
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo = -len;      /* negative offset from the centre, moving up   */
    int hi = len - 1;   /* positive offset from the centre, moving down */

    /* re-base onto the centre so that lo/hi index both halves */
    dst  += len;
    win  += len;
    src0 += len;

    while (lo < 0) {
        float a    = src0[lo];
        float b    = src1[hi];
        float w_lo = win[lo];
        float w_hi = win[hi];

        dst[lo] = a*w_hi - b*w_lo;
        dst[hi] = a*w_lo + b*w_hi;

        lo++;
        hi--;
    }
}
2493
/**
 * Multiply every element of a float vector by a scalar.
 *
 * @param dst output, dst[i] = src[i] * mul
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements; non-positive does nothing
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
2501
/**
 * Multiply src by a sequence of 2-element sub-vectors and a scalar.
 *
 * Each entry of sv points to 2 floats; entry n covers output positions
 * 2n and 2n+1.
 *
 * @param dst output, dst[i] = src[i] * sv[i/2][i%2] * mul
 * @param src input vector
 * @param sv  array of pointers to 2-element vectors
 * @param mul scalar factor
 * @param len number of elements; processed in steps of 2
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        const float *vec = *sv++;

        dst[pos    ] = src[pos    ] * vec[0] * mul;
        dst[pos + 1] = src[pos + 1] * vec[1] * mul;
    }
}
2511
/**
 * Multiply src by a sequence of 4-element sub-vectors and a scalar.
 *
 * Each entry of sv points to 4 floats; entry n covers output positions
 * 4n .. 4n+3.
 *
 * @param dst output, dst[i] = src[i] * sv[i/4][i%4] * mul
 * @param src input vector
 * @param sv  array of pointers to 4-element vectors
 * @param mul scalar factor
 * @param len number of elements; processed in steps of 4
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 4) {
        const float *vec = *sv++;

        dst[pos    ] = src[pos    ] * vec[0] * mul;
        dst[pos + 1] = src[pos + 1] * vec[1] * mul;
        dst[pos + 2] = src[pos + 2] * vec[2] * mul;
        dst[pos + 3] = src[pos + 3] * vec[3] * mul;
    }
}
2523
/**
 * Scale a sequence of 2-element sub-vectors by a scalar.
 *
 * @param dst output, dst[i] = sv[i/2][i%2] * mul
 * @param sv  array of pointers to 2-element vectors
 * @param mul scalar factor
 * @param len number of output elements; processed in steps of 2
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        const float *vec = *sv++;

        dst[pos    ] = vec[0] * mul;
        dst[pos + 1] = vec[1] * mul;
    }
}
2533
/**
 * Scale a sequence of 4-element sub-vectors by a scalar.
 *
 * @param dst output, dst[i] = sv[i/4][i%4] * mul
 * @param sv  array of pointers to 4-element vectors
 * @param mul scalar factor
 * @param len number of output elements; processed in steps of 4
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 4) {
        const float *vec = *sv++;

        dst[pos    ] = vec[0] * mul;
        dst[pos + 1] = vec[1] * mul;
        dst[pos + 2] = vec[2] * mul;
        dst[pos + 3] = vec[3] * mul;
    }
}
2545
/**
 * In-place butterfly on two float vectors.
 *
 * After the call v1[i] holds the old v1[i] + v2[i] and v2[i] holds the
 * old v1[i] - v2[i].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements; non-positive does nothing
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k = 0;

    while (k < len) {
        float a = v1[k];
        float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
        k++;
    }
}
2556
/**
 * Dot product of two float vectors, accumulated in single precision in
 * ascending index order.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements; non-positive yields 0
 * @return sum of v1[i] * v2[i]
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
2567
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * The caller (vector_clipf_c_opposite_sign) passes the raw 32-bit patterns
 * of floats, with mini taken from a negative and maxi from a positive
 * bound.
 *
 * @param a        value to clip
 * @param mini     lower bound pattern; returned when a > mini (unsigned)
 * @param maxi     upper bound pattern
 * @param maxisign maxi with bit 31 flipped; maxi is returned when a with
 *                 bit 31 flipped is greater than this (unsigned)
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    return flipped > maxisign ? maxi : a;
}
2576
/**
 * Clip a float vector by comparing raw 32-bit patterns with clipf_c_one().
 *
 * The only caller in this file, vector_clipf_c(), uses this path when
 * min < 0 and max > 0. Elements are handled in groups of 8, so the last
 * group is always processed in full even if len is not a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len)
{
    const uint32_t lo_bits = *(uint32_t*)min;
    const uint32_t hi_bits = *(uint32_t*)max;
    const uint32_t hi_flip = hi_bits ^ (1U<<31);
    uint32_t *out          = (uint32_t*)dst;
    const uint32_t *in     = (const uint32_t*)src;
    int base, lane;

    for (base = 0; base < len; base += 8)
        for (lane = 0; lane < 8; lane++)
            out[base + lane] = clipf_c_one(in[base + lane], lo_bits, hi_bits, hi_flip);
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs (min < 0 and max > 0) the work is
 * delegated to vector_clipf_c_opposite_sign(); otherwise av_clipf() is
 * applied per element. Elements are handled in groups of 8, so the last
 * group is always processed in full even if len is not a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, lane;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (lane = 0; lane < 8; lane++)
            dst[base + lane] = av_clipf(src[base + lane], min, max);
}
2612
/**
 * Dot product of two int16_t vectors where each product is shifted right
 * by shift before it is accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every individual product
 * @return sum of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
2622
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For each element the product of the current v1[i] and v2[i] is
 * accumulated first, then v1[i] is updated to v1[i] + mul * v3[i].
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   scale factor for v3
 * @return sum of v1[i] * v2[i] using the values of v1 before the update
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
2632
/**
 * Apply a symmetric window to an int16_t vector.
 *
 * Only the first len/2 window coefficients are read; coefficient i is
 * applied to element i and to its mirror element len-1-i. Each product is
 * rounded by adding 1<<14 and shifted right by 15.
 *
 * @param output windowed result, len elements
 * @param input  samples to window, len elements
 * @param window first half of the window, len/2 elements
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int half = len >> 1;
    int i;

    for (i = 0; i < half; i++) {
        const int16_t coef        = window[i];
        const unsigned int mirror = len-i-1;

        output[i]      = (MUL16(input[i],      coef) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], coef) + (1 << 14)) >> 15;
    }
}
2645
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are processed in groups of 8 inside a do/while loop, so at
 * least one group of 8 is always processed and len is decremented by 8
 * per group until it reaches 0.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int lane;

        for (lane = 0; lane < 8; lane++)
            dst[lane] = av_clip(src[lane], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2661
/* Integer cosine coefficients used by wmv2_idct_row() and wmv2_idct_col()
 * below. As the per-line notes state, Wk is 2048*sqrt(2)*cos(k*pi/16)
 * rounded to an integer for k = 1..7; W0 is plain 2048. */
2662 #define W0 2048
2663 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2664 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2665 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2666 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2667 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2668 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2669 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2670
2671 static void wmv2_idct_row(short * b)
2672 {
2673     int s1,s2;
2674     int a0,a1,a2,a3,a4,a5,a6,a7;
2675     /*step 1*/
2676     a1 = W1*b[1]+W7*b[7];
2677     a7 = W7*b[1]-W1*b[7];
2678     a5 = W5*b[5]+W3*b[3];
2679     a3 = W3*b[5]-W5*b[3];
2680     a2 = W2*b[2]+W6*b[6];
2681     a6 = W6*b[2]-W2*b[6];
2682     a0 = W0*b[0]+W0*b[4];
2683     a4 = W0*b[0]-W0*b[4];
2684     /*step 2*/
2685     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2686     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2687     /*step 3*/
2688     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2689     b[1] = (a4+a6 +s1   + (1<<7))>>8;
2690     b[2] = (a4-a6 +s2   + (1<<7))>>8;
2691     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2692     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2693     b[5] = (a4-a6 -s2   + (1<<7))>>8;
2694     b[6] = (a4+a6 -s1   + (1<<7))>>8;
2695     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2696 }
2697 static void wmv2_idct_col(short * b)
2698 {
2699     int s1,s2;
2700     int a0,a1,a2,a3,a4,a5,a6,a7;
2701     /*step 1, with extended precision*/
2702     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2703     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2704     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2705     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2706     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2707     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2708     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2709     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2710     /*step 2*/
2711     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2712     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2713     /*step 3*/
2714     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2715     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2716     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2717     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2718
2719     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2720     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2721     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2722     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2723 }
/**
 * In-place WMV2 inverse DCT of an 8x8 block: all 8 rows are transformed
 * first, then all 8 columns.
 *
 * @param block 64 coefficients stored row by row; overwritten
 */
void ff_wmv2_idct_c(short * block)
{
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
2734 /* XXX: the IDCT put/add wrapper functions below should be removed as
2735  soon as all IDCTs are converted */
/* Runs ff_wmv2_idct_c() on block in place, then hands the transformed
 * block to ff_put_pixels_clamped_c() together with dest and line_size. */
2736 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2737 {
2738     ff_wmv2_idct_c(block);
2739     ff_put_pixels_clamped_c(block, dest, line_size);
2740 }
/* Runs ff_wmv2_idct_c() on block in place, then hands the transformed
 * block to ff_add_pixels_clamped_c() together with dest and line_size. */
2741 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2742 {
2743     ff_wmv2_idct_c(block);
2744     ff_add_pixels_clamped_c(block, dest, line_size);
2745 }
/* Runs j_rev_dct() on block, then hands the block to
 * ff_put_pixels_clamped_c() together with dest and line_size. */
2746 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2747 {
2748     j_rev_dct (block);
2749     ff_put_pixels_clamped_c(block, dest, line_size);
2750 }
/* Runs j_rev_dct() on block, then hands the block to
 * ff_add_pixels_clamped_c() together with dest and line_size. */
2751 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2752 {
2753     j_rev_dct (block);
2754     ff_add_pixels_clamped_c(block, dest, line_size);
2755 }
2756
/* Runs j_rev_dct4() on block, then hands the block to
 * put_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() selects this as idct_put when avctx->lowres == 1. */
2757 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2758 {
2759     j_rev_dct4 (block);
2760     put_pixels_clamped4_c(block, dest, line_size);
2761 }
/* Runs j_rev_dct4() on block, then hands the block to
 * add_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() selects this as idct_add when avctx->lowres == 1. */
2762 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2763 {
2764     j_rev_dct4 (block);
2765     add_pixels_clamped4_c(block, dest, line_size);
2766 }
2767
/* Runs j_rev_dct2() on block, then hands the block to
 * put_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() selects this as idct_put when avctx->lowres == 2. */
2768 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2769 {
2770     j_rev_dct2 (block);
2771     put_pixels_clamped2_c(block, dest, line_size);
2772 }
/* Runs j_rev_dct2() on block, then hands the block to
 * add_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() selects this as idct_add when avctx->lowres == 2. */
2773 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2774 {
2775     j_rev_dct2 (block);
2776     add_pixels_clamped2_c(block, dest, line_size);
2777 }
2778
2779 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2780 {
2781     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2782
2783     dest[0] = cm[(block[0] + 4)>>3];
2784 }
2785 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2786 {
2787     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2788
2789     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2790 }
2791
/* No-op that ignores all of its arguments; dsputil_init() installs it as
 * the default c->prefetch handler. */
2792 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2793
2794 /* init static data */
2795 av_cold void dsputil_static_init(void)
2796 {
2797     int i;
2798
2799     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2800     for(i=0;i<MAX_NEG_CROP;i++) {
2801         ff_cropTbl[i] = 0;
2802         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2803     }
2804
2805     for(i=0;i<512;i++) {
2806         ff_squareTbl[i] = (i - 256) * (i - 256);
2807     }
2808
2809     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2810 }
2811
2812 int ff_check_alignment(void){
2813     static int did_fail=0;
2814     LOCAL_ALIGNED_16(int, aligned, [4]);
2815
2816     if((intptr_t)aligned & 15){
2817         if(!did_fail){
2818 #if HAVE_MMX || HAVE_ALTIVEC
2819             av_log(NULL, AV_LOG_ERROR,
2820                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2821                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2822                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2823                 "Do not report crashes to FFmpeg developers.\n");
2824 #endif
2825             did_fail=1;
2826         }
2827         return -1;
2828     }
2829     return 0;
2830 }
2831
/**
 * Fill a DSPContext with function pointers.
 *
 * The C implementations are installed first: forward DCT chosen by
 * avctx->dct_algo (encoder builds only), inverse DCT chosen by
 * avctx->lowres, avctx->bits_per_raw_sample and avctx->idct_algo, followed
 * by the pixel, comparison, audio/vector and bit-depth dependent tables.
 * Afterwards the architecture-specific init functions that are enabled at
 * build time are called, unset 2tap qpel entries are filled from the h264
 * qpel tables, and c->idct_permutation is built from
 * c->idct_permutation_type.
 *
 * @param c     context to fill
 * @param avctx codec context supplying dct_algo, idct_algo, lowres and
 *              bits_per_raw_sample
 */
2832 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2833 {
2834     int i;
2835
2836     ff_check_alignment();
2837
    /* forward DCT selection, driven by avctx->dct_algo */
2838 #if CONFIG_ENCODERS
2839     if(avctx->dct_algo==FF_DCT_FASTINT) {
2840         c->fdct = fdct_ifast;
2841         c->fdct248 = fdct_ifast248;
2842     }
2843     else if(avctx->dct_algo==FF_DCT_FAAN) {
2844         c->fdct = ff_faandct;
2845         c->fdct248 = ff_faandct248;
2846     }
2847     else {
2848         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2849         c->fdct248 = ff_fdct248_islow;
2850     }
2851 #endif //CONFIG_ENCODERS
2852
    /* inverse DCT selection: lowres 1/2/3 pick the jref idct4/idct2/idct1
     * variants; otherwise a raw sample depth of 10 picks the 10-bit simple
     * IDCT and anything else is decided by avctx->idct_algo */
2853     if(avctx->lowres==1){
2854         c->idct_put= ff_jref_idct4_put;
2855         c->idct_add= ff_jref_idct4_add;
2856         c->idct    = j_rev_dct4;
2857         c->idct_permutation_type= FF_NO_IDCT_PERM;
2858     }else if(avctx->lowres==2){
2859         c->idct_put= ff_jref_idct2_put;
2860         c->idct_add= ff_jref_idct2_add;
2861         c->idct    = j_rev_dct2;
2862         c->idct_permutation_type= FF_NO_IDCT_PERM;
2863     }else if(avctx->lowres==3){
2864         c->idct_put= ff_jref_idct1_put;
2865         c->idct_add= ff_jref_idct1_add;
2866         c->idct    = j_rev_dct1;
2867         c->idct_permutation_type= FF_NO_IDCT_PERM;
2868     }else{
2869         if (avctx->bits_per_raw_sample == 10) {
2870             c->idct_put              = ff_simple_idct_put_10;
2871             c->idct_add              = ff_simple_idct_add_10;
2872             c->idct                  = ff_simple_idct_10;
2873             c->idct_permutation_type = FF_NO_IDCT_PERM;
2874         } else {
2875         if(avctx->idct_algo==FF_IDCT_INT){
2876             c->idct_put= ff_jref_idct_put;
2877             c->idct_add= ff_jref_idct_add;
2878             c->idct    = j_rev_dct;
2879             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2880         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2881                 avctx->idct_algo==FF_IDCT_VP3){
2882             c->idct_put= ff_vp3_idct_put_c;
2883             c->idct_add= ff_vp3_idct_add_c;
2884             c->idct    = ff_vp3_idct_c;
2885             c->idct_permutation_type= FF_NO_IDCT_PERM;
2886         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2887             c->idct_put= ff_wmv2_idct_put_c;
2888             c->idct_add= ff_wmv2_idct_add_c;
2889             c->idct    = ff_wmv2_idct_c;
2890             c->idct_permutation_type= FF_NO_IDCT_PERM;
2891         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2892             c->idct_put= ff_faanidct_put;
2893             c->idct_add= ff_faanidct_add;
2894             c->idct    = ff_faanidct;
2895             c->idct_permutation_type= FF_NO_IDCT_PERM;
2896         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE(review): this branch sets only idct_put; idct_add and
             * idct keep whatever value they had before -- confirm that no
             * caller uses them with FF_IDCT_EA */
2897             c->idct_put= ff_ea_idct_put_c;
2898             c->idct_permutation_type= FF_NO_IDCT_PERM;
2899         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2900             c->idct     = ff_bink_idct_c;
2901             c->idct_add = ff_bink_idct_add_c;
2902             c->idct_put = ff_bink_idct_put_c;
2903             c->idct_permutation_type = FF_NO_IDCT_PERM;
2904         }else{ //accurate/default
2905             c->idct_put = ff_simple_idct_put_8;
2906             c->idct_add = ff_simple_idct_add_8;
2907             c->idct     = ff_simple_idct_8;
2908             c->idct_permutation_type= FF_NO_IDCT_PERM;
2909         }
2910         }
2911     }
2912
    /* pixel access, global motion compensation and block helpers */
2913     c->get_pixels = get_pixels_c;
2914     c->diff_pixels = diff_pixels_c;
2915     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2916     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2917     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2918     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2919     c->sum_abs_dctelem = sum_abs_dctelem_c;
2920     c->gmc1 = gmc1_c;
2921     c->gmc = ff_gmc_c;
2922     c->pix_sum = pix_sum_c;
2923     c->pix_norm1 = pix_norm1_c;
2924
2925     c->fill_block_tab[0] = fill_block16_c;
2926     c->fill_block_tab[1] = fill_block8_c;
2927     c->scale_block = scale_block_c;
2928
2929     /* TODO [0] 16  [1] 8 */
2930     c->pix_abs[0][0] = pix_abs16_c;
2931     c->pix_abs[0][1] = pix_abs16_x2_c;
2932     c->pix_abs[0][2] = pix_abs16_y2_c;
2933     c->pix_abs[0][3] = pix_abs16_xy2_c;
2934     c->pix_abs[1][0] = pix_abs8_c;
2935     c->pix_abs[1][1] = pix_abs8_x2_c;
2936     c->pix_abs[1][2] = pix_abs8_y2_c;
2937     c->pix_abs[1][3] = pix_abs8_xy2_c;
2938
    /* tpel tables: only indices 0-2, 4-6 and 8-10 are assigned here */
2939     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2940     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2941     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2942     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2943     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2944     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2945     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2946     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2947     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2948
2949     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2950     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2951     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2952     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2953     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2954     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2955     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2956     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2957     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2958
    /* dspfunc(PFX, IDX, NUM) fills the 16 entries of
     * c->PFX_pixels_tab[IDX] with PFX<NUM>_mcXY_c, X and Y in 0..3 */
2959 #define dspfunc(PFX, IDX, NUM) \
2960     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2961     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2962     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2963     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2964     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2965     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2966     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2967     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2968     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2969     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2970     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2971     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2972     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2973     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2974     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2975     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2976
2977     dspfunc(put_qpel, 0, 16);
2978     dspfunc(put_no_rnd_qpel, 0, 16);
2979
2980     dspfunc(avg_qpel, 0, 16);
2981     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2982
2983     dspfunc(put_qpel, 1, 8);
2984     dspfunc(put_no_rnd_qpel, 1, 8);
2985
2986     dspfunc(avg_qpel, 1, 8);
2987     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2988
2989 #undef dspfunc
2990
    /* codec-specific init functions, compiled in per decoder option */
2991 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2992     ff_mlp_init(c, avctx);
2993 #endif
2994 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2995     ff_intrax8dsp_init(c,avctx);
2996 #endif
2997 #if CONFIG_RV30_DECODER
2998     ff_rv30dsp_init(c,avctx);
2999 #endif
3000 #if CONFIG_RV40_DECODER
3001     ff_rv40dsp_init(c,avctx);
3002     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3003     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3004     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3005     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3006 #endif
3007
3008     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3009     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3010     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3011     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3012     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3013     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3014     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3015     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3016
    /* SET_CMP_FUNC(name) installs name16_c in slot 0 and name8x8_c in
     * slot 1 of c->name[] */
3017 #define SET_CMP_FUNC(name) \
3018     c->name[0]= name ## 16_c;\
3019     c->name[1]= name ## 8x8_c;
3020
3021     SET_CMP_FUNC(hadamard8_diff)
3022     c->hadamard8_diff[4]= hadamard8_intra16_c;
3023     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3024     SET_CMP_FUNC(dct_sad)
3025     SET_CMP_FUNC(dct_max)
3026 #if CONFIG_GPL
3027     SET_CMP_FUNC(dct264_sad)
3028 #endif
3029     c->sad[0]= pix_abs16_c;
3030     c->sad[1]= pix_abs8_c;
3031     c->sse[0]= sse16_c;
3032     c->sse[1]= sse8_c;
3033     c->sse[2]= sse4_c;
3034     SET_CMP_FUNC(quant_psnr)
3035     SET_CMP_FUNC(rd)
3036     SET_CMP_FUNC(bit)
3037     c->vsad[0]= vsad16_c;
3038     c->vsad[4]= vsad_intra16_c;
3039     c->vsad[5]= vsad_intra8_c;
3040     c->vsse[0]= vsse16_c;
3041     c->vsse[4]= vsse_intra16_c;
3042     c->vsse[5]= vsse_intra8_c;
3043     c->nsse[0]= nsse16_c;
3044     c->nsse[1]= nsse8_c;
3045 #if CONFIG_DWT
3046     ff_dsputil_init_dwt(c);
3047 #endif
3048
3049     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3050
3051     c->add_bytes= add_bytes_c;
3052     c->diff_bytes= diff_bytes_c;
3053     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3054     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3055     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3056     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3057     c->bswap_buf= bswap_buf;
3058     c->bswap16_buf = bswap16_buf;
3059
3060     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3061         c->h263_h_loop_filter= h263_h_loop_filter_c;
3062         c->h263_v_loop_filter= h263_v_loop_filter_c;
3063     }
3064
3065     if (CONFIG_VP3_DECODER) {
3066         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3067         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3068         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3069     }
3070
3071     c->h261_loop_filter= h261_loop_filter_c;
3072
3073     c->try_8x8basis= try_8x8basis_c;
3074     c->add_8x8basis= add_8x8basis_c;
3075
    /* audio / vector helpers */
3076 #if CONFIG_VORBIS_DECODER
3077     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3078 #endif
3079 #if CONFIG_AC3_DECODER
3080     c->ac3_downmix = ff_ac3_downmix_c;
3081 #endif
3082     c->vector_fmul = vector_fmul_c;
3083     c->vector_fmul_reverse = vector_fmul_reverse_c;
3084     c->vector_fmul_add = vector_fmul_add_c;
3085     c->vector_fmul_window = vector_fmul_window_c;
3086     c->vector_clipf = vector_clipf_c;
3087     c->scalarproduct_int16 = scalarproduct_int16_c;
3088     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3089     c->apply_window_int16 = apply_window_int16_c;
3090     c->vector_clip_int32 = vector_clip_int32_c;
3091     c->scalarproduct_float = scalarproduct_float_c;
3092     c->butterflies_float = butterflies_float_c;
3093     c->vector_fmul_scalar = vector_fmul_scalar_c;
3094
3095     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3096     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3097
3098     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3099     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3100
3101     c->shrink[0]= av_image_copy_plane;
3102     c->shrink[1]= ff_shrink22;
3103     c->shrink[2]= ff_shrink44;
3104     c->shrink[3]= ff_shrink88;
3105
3106     c->prefetch= just_return;
3107
    /* cleared here so that the loop after the architecture init calls can
     * tell which 2tap entries are still unset */
3108     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3109     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3110
    /* name-pasting helpers: FUNC(f, depth) -> f_<depth>,
     * FUNCC(f, depth) -> f_<depth>_c */
3111 #undef FUNC
3112 #undef FUNCC
3113 #define FUNC(f, depth) f ## _ ## depth
3114 #define FUNCC(f, depth) f ## _ ## depth ## _c
3115
3116 #define dspfunc1(PFX, IDX, NUM, depth)\
3117     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3118     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3119     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3120     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3121
3122 #define dspfunc2(PFX, IDX, NUM, depth)\
3123     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3124     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3125     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3126     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3127     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3128     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3129     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3130     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3131     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3132     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3133     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3134     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3135     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3136     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3137     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3138     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3139
3140
3141 #define BIT_DEPTH_FUNCS(depth)\
3142     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3143     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3144     c->clear_block                   = FUNCC(clear_block           , depth);\
3145     c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
3146     c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
3147     c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
3148     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3149     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3150 \
3151     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3152     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3153     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3154     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3155     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3156     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3157 \
3158     dspfunc1(put       , 0, 16, depth);\
3159     dspfunc1(put       , 1,  8, depth);\
3160     dspfunc1(put       , 2,  4, depth);\
3161     dspfunc1(put       , 3,  2, depth);\
3162     dspfunc1(put_no_rnd, 0, 16, depth);\
3163     dspfunc1(put_no_rnd, 1,  8, depth);\
3164     dspfunc1(avg       , 0, 16, depth);\
3165     dspfunc1(avg       , 1,  8, depth);\
3166     dspfunc1(avg       , 2,  4, depth);\
3167     dspfunc1(avg       , 3,  2, depth);\
3168     dspfunc1(avg_no_rnd, 0, 16, depth);\
3169     dspfunc1(avg_no_rnd, 1,  8, depth);\
3170 \
3171     dspfunc2(put_h264_qpel, 0, 16, depth);\
3172     dspfunc2(put_h264_qpel, 1,  8, depth);\
3173     dspfunc2(put_h264_qpel, 2,  4, depth);\
3174     dspfunc2(put_h264_qpel, 3,  2, depth);\
3175     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3176     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3177     dspfunc2(avg_h264_qpel, 2,  4, depth);
3178
    /* install the bit-depth specific functions; depths other than 9 and 10
     * use the 8-bit set */
3179     switch (avctx->bits_per_raw_sample) {
3180     case 9:
3181         BIT_DEPTH_FUNCS(9);
3182         break;
3183     case 10:
3184         BIT_DEPTH_FUNCS(10);
3185         break;
3186     default:
3187         av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
        /* no break here: execution continues into the 8-bit case */
3188     case 8:
3189         BIT_DEPTH_FUNCS(8);
3190         break;
3191     }
3192
3193
    /* architecture-specific init functions, each called only when its
     * build-time condition is non-zero; they run after all the C
     * assignments above */
3194     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3195     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3196     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3197     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3198     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3199     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3200     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3201     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3202     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3203
    /* every 2tap qpel entry that is still NULL falls back to the h264 qpel
     * function at the same position.
     * NOTE(review): i runs to 63 through row [0] of each table, which
     * presumably relies on the tables being contiguous [4][16] arrays --
     * confirm against the DSPContext declaration */
3204     for(i=0; i<64; i++){
3205         if(!c->put_2tap_qpel_pixels_tab[0][i])
3206             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3207         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3208             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3209     }
3210
    /* entry 0 of the rv30 tpel and rv40 qpel tables reuses entry 0 of the
     * h264 qpel tables */
3211     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3212     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3213     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3214     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3215
3216     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3217     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3218     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3219     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3220
    /* build the 64-entry coefficient permutation for the permutation type
     * in effect at this point; an unknown type only logs an error and
     * leaves idct_permutation[] untouched */
3221     switch(c->idct_permutation_type){
3222     case FF_NO_IDCT_PERM:
3223         for(i=0; i<64; i++)
3224             c->idct_permutation[i]= i;
3225         break;
3226     case FF_LIBMPEG2_IDCT_PERM:
3227         for(i=0; i<64; i++)
3228             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3229         break;
3230     case FF_SIMPLE_IDCT_PERM:
3231         for(i=0; i<64; i++)
3232             c->idct_permutation[i]= simple_mmx_permutation[i];
3233         break;
3234     case FF_TRANSPOSE_IDCT_PERM:
3235         for(i=0; i<64; i++)
3236             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3237         break;
3238     case FF_PARTTRANS_IDCT_PERM:
3239         for(i=0; i<64; i++)
3240             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3241         break;
3242     case FF_SSE2_IDCT_PERM:
3243         for(i=0; i<64; i++)
3244             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3245         break;
3246     default:
3247         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3248     }
3249 }