git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 #define BIT_DEPTH 9
  47 #include "dsputil_template.c"
  48 #undef BIT_DEPTH
  49
  50 #define BIT_DEPTH 10
  51 #include "dsputil_template.c"
  52 #undef BIT_DEPTH
  53
  54 #define BIT_DEPTH 8
  55 #include "dsputil_template.c"
  56
  57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  58 #define pb_7f (~0UL/255 * 0x7f)
  59 #define pb_80 (~0UL/255 * 0x80)
  60
  61 const uint8_t ff_zigzag_direct[64] = {
  62     0,   1,  8, 16,  9,  2,  3, 10,
  63     17, 24, 32, 25, 18, 11,  4,  5,
  64     12, 19, 26, 33, 40, 48, 41, 34,
  65     27, 20, 13,  6,  7, 14, 21, 28,
  66     35, 42, 49, 56, 57, 50, 43, 36,
  67     29, 22, 15, 23, 30, 37, 44, 51,
  68     58, 59, 52, 45, 38, 31, 39, 46,
  69     53, 60, 61, 54, 47, 55, 62, 63
  70 };
  71
  72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  73    specification, we interleave the fields */
  74 const uint8_t ff_zigzag248_direct[64] = {
  75      0,  8,  1,  9, 16, 24,  2, 10,
  76     17, 25, 32, 40, 48, 56, 33, 41,
  77     18, 26,  3, 11,  4, 12, 19, 27,
  78     34, 42, 49, 57, 50, 58, 35, 43,
  79     20, 28,  5, 13,  6, 14, 21, 29,
  80     36, 44, 51, 59, 52, 60, 37, 45,
  81     22, 30,  7, 15, 23, 31, 38, 46,
  82     53, 61, 54, 62, 39, 47, 55, 63,
  83 };
  84
  85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  87
  88 const uint8_t ff_alternate_horizontal_scan[64] = {
  89     0,  1,   2,  3,  8,  9, 16, 17,
  90     10, 11,  4,  5,  6,  7, 15, 14,
  91     13, 12, 19, 18, 24, 25, 32, 33,
  92     26, 27, 20, 21, 22, 23, 28, 29,
  93     30, 31, 34, 35, 40, 41, 48, 49,
  94     42, 43, 36, 37, 38, 39, 44, 45,
  95     46, 47, 50, 51, 56, 57, 58, 59,
  96     52, 53, 54, 55, 60, 61, 62, 63,
  97 };
  98
  99 const uint8_t ff_alternate_vertical_scan[64] = {
 100     0,  8,  16, 24,  1,  9,  2, 10,
 101     17, 25, 32, 40, 48, 56, 57, 49,
 102     41, 33, 26, 18,  3, 11,  4, 12,
 103     19, 27, 34, 42, 50, 58, 35, 43,
 104     51, 59, 20, 28,  5, 13,  6, 14,
 105     21, 29, 36, 44, 52, 60, 37, 45,
 106     53, 61, 22, 30,  7, 15, 23, 31,
 107     38, 46, 54, 62, 39, 47, 55, 63,
 108 };
 109
 110 /* Input permutation for the simple_idct_mmx */
 111 static const uint8_t simple_mmx_permutation[64]={
 112         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 113         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 114         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 115         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 116         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 117         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 118         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 119         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 120 };
 121
 122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 123
 124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 125     int i;
 126     int end;
 127
 128     st->scantable= src_scantable;
 129
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = src_scantable[i];
 133         st->permutated[i] = permutation[j];
 134 #if ARCH_PPC
 135         st->inverse[j] = i;
 136 #endif
 137     }
 138
 139     end=-1;
 140     for(i=0; i<64; i++){
 141         int j;
 142         j = st->permutated[i];
 143         if(j>end) end=j;
 144         st->raster_end[i]= end;
 145     }
 146 }
 147
 148 static int pix_sum_c(uint8_t * pix, int line_size)
 149 {
 150     int s, i, j;
 151
 152     s = 0;
 153     for (i = 0; i < 16; i++) {
 154         for (j = 0; j < 16; j += 8) {
 155             s += pix[0];
 156             s += pix[1];
 157             s += pix[2];
 158             s += pix[3];
 159             s += pix[4];
 160             s += pix[5];
 161             s += pix[6];
 162             s += pix[7];
 163             pix += 8;
 164         }
 165         pix += line_size - 16;
 166     }
 167     return s;
 168 }
 169
 170 static int pix_norm1_c(uint8_t * pix, int line_size)
 171 {
 172     int s, i, j;
 173     uint32_t *sq = ff_squareTbl + 256;
 174
 175     s = 0;
 176     for (i = 0; i < 16; i++) {
 177         for (j = 0; j < 16; j += 8) {
 178 #if 0
 179             s += sq[pix[0]];
 180             s += sq[pix[1]];
 181             s += sq[pix[2]];
 182             s += sq[pix[3]];
 183             s += sq[pix[4]];
 184             s += sq[pix[5]];
 185             s += sq[pix[6]];
 186             s += sq[pix[7]];
 187 #else
 188 #if LONG_MAX > 2147483647
 189             register uint64_t x=*(uint64_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             s += sq[(x>>32)&0xff];
 195             s += sq[(x>>40)&0xff];
 196             s += sq[(x>>48)&0xff];
 197             s += sq[(x>>56)&0xff];
 198 #else
 199             register uint32_t x=*(uint32_t*)pix;
 200             s += sq[x&0xff];
 201             s += sq[(x>>8)&0xff];
 202             s += sq[(x>>16)&0xff];
 203             s += sq[(x>>24)&0xff];
 204             x=*(uint32_t*)(pix+4);
 205             s += sq[x&0xff];
 206             s += sq[(x>>8)&0xff];
 207             s += sq[(x>>16)&0xff];
 208             s += sq[(x>>24)&0xff];
 209 #endif
 210 #endif
 211             pix += 8;
 212         }
 213         pix += line_size - 16;
 214     }
 215     return s;
 216 }
 217
 218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 219     int i;
 220
 221     for(i=0; i+8<=w; i+=8){
 222         dst[i+0]= av_bswap32(src[i+0]);
 223         dst[i+1]= av_bswap32(src[i+1]);
 224         dst[i+2]= av_bswap32(src[i+2]);
 225         dst[i+3]= av_bswap32(src[i+3]);
 226         dst[i+4]= av_bswap32(src[i+4]);
 227         dst[i+5]= av_bswap32(src[i+5]);
 228         dst[i+6]= av_bswap32(src[i+6]);
 229         dst[i+7]= av_bswap32(src[i+7]);
 230     }
 231     for(;i<w; i++){
 232         dst[i+0]= av_bswap32(src[i+0]);
 233     }
 234 }
 235
 236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 237 {
 238     while (len--)
 239         *dst++ = av_bswap16(*src++);
 240 }
 241
 242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = ff_squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         pix1 += line_size;
 254         pix2 += line_size;
 255     }
 256     return s;
 257 }
 258
 259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 260 {
 261     int s, i;
 262     uint32_t *sq = ff_squareTbl + 256;
 263
 264     s = 0;
 265     for (i = 0; i < h; i++) {
 266         s += sq[pix1[0] - pix2[0]];
 267         s += sq[pix1[1] - pix2[1]];
 268         s += sq[pix1[2] - pix2[2]];
 269         s += sq[pix1[3] - pix2[3]];
 270         s += sq[pix1[4] - pix2[4]];
 271         s += sq[pix1[5] - pix2[5]];
 272         s += sq[pix1[6] - pix2[6]];
 273         s += sq[pix1[7] - pix2[7]];
 274         pix1 += line_size;
 275         pix2 += line_size;
 276     }
 277     return s;
 278 }
 279
 280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 281 {
 282     int s, i;
 283     uint32_t *sq = ff_squareTbl + 256;
 284
 285     s = 0;
 286     for (i = 0; i < h; i++) {
 287         s += sq[pix1[ 0] - pix2[ 0]];
 288         s += sq[pix1[ 1] - pix2[ 1]];
 289         s += sq[pix1[ 2] - pix2[ 2]];
 290         s += sq[pix1[ 3] - pix2[ 3]];
 291         s += sq[pix1[ 4] - pix2[ 4]];
 292         s += sq[pix1[ 5] - pix2[ 5]];
 293         s += sq[pix1[ 6] - pix2[ 6]];
 294         s += sq[pix1[ 7] - pix2[ 7]];
 295         s += sq[pix1[ 8] - pix2[ 8]];
 296         s += sq[pix1[ 9] - pix2[ 9]];
 297         s += sq[pix1[10] - pix2[10]];
 298         s += sq[pix1[11] - pix2[11]];
 299         s += sq[pix1[12] - pix2[12]];
 300         s += sq[pix1[13] - pix2[13]];
 301         s += sq[pix1[14] - pix2[14]];
 302         s += sq[pix1[15] - pix2[15]];
 303
 304         pix1 += line_size;
 305         pix2 += line_size;
 306     }
 307     return s;
 308 }
 309
 310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 311 {
 312     int i;
 313
 314     /* read the pixels */
 315     for(i=0;i<8;i++) {
 316         block[0] = pixels[0];
 317         block[1] = pixels[1];
 318         block[2] = pixels[2];
 319         block[3] = pixels[3];
 320         block[4] = pixels[4];
 321         block[5] = pixels[5];
 322         block[6] = pixels[6];
 323         block[7] = pixels[7];
 324         pixels += line_size;
 325         block += 8;
 326     }
 327 }
 328
 329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 330                           const uint8_t *s2, int stride){
 331     int i;
 332
 333     /* read the pixels */
 334     for(i=0;i<8;i++) {
 335         block[0] = s1[0] - s2[0];
 336         block[1] = s1[1] - s2[1];
 337         block[2] = s1[2] - s2[2];
 338         block[3] = s1[3] - s2[3];
 339         block[4] = s1[4] - s2[4];
 340         block[5] = s1[5] - s2[5];
 341         block[6] = s1[6] - s2[6];
 342         block[7] = s1[7] - s2[7];
 343         s1 += stride;
 344         s2 += stride;
 345         block += 8;
 346     }
 347 }
 348
 349
 350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 351                              int line_size)
 352 {
 353     int i;
 354     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 355
 356     /* read the pixels */
 357     for(i=0;i<8;i++) {
 358         pixels[0] = cm[block[0]];
 359         pixels[1] = cm[block[1]];
 360         pixels[2] = cm[block[2]];
 361         pixels[3] = cm[block[3]];
 362         pixels[4] = cm[block[4]];
 363         pixels[5] = cm[block[5]];
 364         pixels[6] = cm[block[6]];
 365         pixels[7] = cm[block[7]];
 366
 367         pixels += line_size;
 368         block += 8;
 369     }
 370 }
 371
 372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 373                                  int line_size)
 374 {
 375     int i;
 376     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 377
 378     /* read the pixels */
 379     for(i=0;i<4;i++) {
 380         pixels[0] = cm[block[0]];
 381         pixels[1] = cm[block[1]];
 382         pixels[2] = cm[block[2]];
 383         pixels[3] = cm[block[3]];
 384
 385         pixels += line_size;
 386         block += 8;
 387     }
 388 }
 389
 390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 391                                  int line_size)
 392 {
 393     int i;
 394     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 395
 396     /* read the pixels */
 397     for(i=0;i<2;i++) {
 398         pixels[0] = cm[block[0]];
 399         pixels[1] = cm[block[1]];
 400
 401         pixels += line_size;
 402         block += 8;
 403     }
 404 }
 405
 406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 407                                     uint8_t *restrict pixels,
 408                                     int line_size)
 409 {
 410     int i, j;
 411
 412     for (i = 0; i < 8; i++) {
 413         for (j = 0; j < 8; j++) {
 414             if (*block < -128)
 415                 *pixels = 0;
 416             else if (*block > 127)
 417                 *pixels = 255;
 418             else
 419                 *pixels = (uint8_t)(*block + 128);
 420             block++;
 421             pixels++;
 422         }
 423         pixels += (line_size - 8);
 424     }
 425 }
 426
 427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 428                                     int line_size)
 429 {
 430     int i;
 431
 432     /* read the pixels */
 433     for(i=0;i<8;i++) {
 434         pixels[0] = block[0];
 435         pixels[1] = block[1];
 436         pixels[2] = block[2];
 437         pixels[3] = block[3];
 438         pixels[4] = block[4];
 439         pixels[5] = block[5];
 440         pixels[6] = block[6];
 441         pixels[7] = block[7];
 442
 443         pixels += line_size;
 444         block += 8;
 445     }
 446 }
 447
 448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 449                              int line_size)
 450 {
 451     int i;
 452     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 453
 454     /* read the pixels */
 455     for(i=0;i<8;i++) {
 456         pixels[0] = cm[pixels[0] + block[0]];
 457         pixels[1] = cm[pixels[1] + block[1]];
 458         pixels[2] = cm[pixels[2] + block[2]];
 459         pixels[3] = cm[pixels[3] + block[3]];
 460         pixels[4] = cm[pixels[4] + block[4]];
 461         pixels[5] = cm[pixels[5] + block[5]];
 462         pixels[6] = cm[pixels[6] + block[6]];
 463         pixels[7] = cm[pixels[7] + block[7]];
 464         pixels += line_size;
 465         block += 8;
 466     }
 467 }
 468
 469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 470                           int line_size)
 471 {
 472     int i;
 473     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 474
 475     /* read the pixels */
 476     for(i=0;i<4;i++) {
 477         pixels[0] = cm[pixels[0] + block[0]];
 478         pixels[1] = cm[pixels[1] + block[1]];
 479         pixels[2] = cm[pixels[2] + block[2]];
 480         pixels[3] = cm[pixels[3] + block[3]];
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 487                           int line_size)
 488 {
 489     int i;
 490     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 491
 492     /* read the pixels */
 493     for(i=0;i<2;i++) {
 494         pixels[0] = cm[pixels[0] + block[0]];
 495         pixels[1] = cm[pixels[1] + block[1]];
 496         pixels += line_size;
 497         block += 8;
 498     }
 499 }
 500
 501 static int sum_abs_dctelem_c(DCTELEM *block)
 502 {
 503     int sum=0, i;
 504     for(i=0; i<64; i++)
 505         sum+= FFABS(block[i]);
 506     return sum;
 507 }
 508
 509 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 510 {
 511     int i;
 512
 513     for (i = 0; i < h; i++) {
 514         memset(block, value, 16);
 515         block += line_size;
 516     }
 517 }
 518
 519 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 520 {
 521     int i;
 522
 523     for (i = 0; i < h; i++) {
 524         memset(block, value, 8);
 525         block += line_size;
 526     }
 527 }
 528
 529 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 530 {
 531     int i, j;
 532     uint16_t *dst1 = (uint16_t *) dst;
 533     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 534
 535     for (j = 0; j < 8; j++) {
 536         for (i = 0; i < 8; i++) {
 537             dst1[i] = dst2[i] = src[i] * 0x0101;
 538         }
 539         src  += 8;
 540         dst1 += linesize;
 541         dst2 += linesize;
 542     }
 543 }
 544
 545 #define avg2(a,b) ((a+b+1)>>1)
 546 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 547
 548 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 549 {
 550     const int A=(16-x16)*(16-y16);
 551     const int B=(   x16)*(16-y16);
 552     const int C=(16-x16)*(   y16);
 553     const int D=(   x16)*(   y16);
 554     int i;
 555
 556     for(i=0; i<h; i++)
 557     {
 558         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
 559         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
 560         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
 561         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
 562         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
 563         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
 564         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
 565         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
 566         dst+= stride;
 567         src+= stride;
 568     }
 569 }
 570
 571 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
 572                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 573 {
 574     int y, vx, vy;
 575     const int s= 1<<shift;
 576
 577     width--;
 578     height--;
 579
 580     for(y=0; y<h; y++){
 581         int x;
 582
 583         vx= ox;
 584         vy= oy;
 585         for(x=0; x<8; x++){ //XXX FIXME optimize
 586             int src_x, src_y, frac_x, frac_y, index;
 587
 588             src_x= vx>>16;
 589             src_y= vy>>16;
 590             frac_x= src_x&(s-1);
 591             frac_y= src_y&(s-1);
 592             src_x>>=shift;
 593             src_y>>=shift;
 594
 595             if((unsigned)src_x < width){
 596                 if((unsigned)src_y < height){
 597                     index= src_x + src_y*stride;
 598                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
 599                                            + src[index       +1]*   frac_x )*(s-frac_y)
 600                                         + (  src[index+stride  ]*(s-frac_x)
 601                                            + src[index+stride+1]*   frac_x )*   frac_y
 602                                         + r)>>(shift*2);
 603                 }else{
 604                     index= src_x + av_clip(src_y, 0, height)*stride;
 605                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
 606                                           + src[index       +1]*   frac_x )*s
 607                                         + r)>>(shift*2);
 608                 }
 609             }else{
 610                 if((unsigned)src_y < height){
 611                     index= av_clip(src_x, 0, width) + src_y*stride;
 612                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
 613                                            + src[index+stride  ]*   frac_y )*s
 614                                         + r)>>(shift*2);
 615                 }else{
 616                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
 617                     dst[y*stride + x]=    src[index         ];
 618                 }
 619             }
 620
 621             vx+= dxx;
 622             vy+= dyx;
 623         }
 624         ox += dxy;
 625         oy += dyy;
 626     }
 627 }
 628
 629 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 630     switch(width){
 631     case 2: put_pixels2_8_c (dst, src, stride, height); break;
 632     case 4: put_pixels4_8_c (dst, src, stride, height); break;
 633     case 8: put_pixels8_8_c (dst, src, stride, height); break;
 634     case 16:put_pixels16_8_c(dst, src, stride, height); break;
 635     }
 636 }
 637
 638 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 639     int i,j;
 640     for (i=0; i < height; i++) {
 641       for (j=0; j < width; j++) {
 642         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
 643       }
 644       src += stride;
 645       dst += stride;
 646     }
 647 }
 648
 649 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 650     int i,j;
 651     for (i=0; i < height; i++) {
 652       for (j=0; j < width; j++) {
 653         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
 654       }
 655       src += stride;
 656       dst += stride;
 657     }
 658 }
 659
 660 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 661     int i,j;
 662     for (i=0; i < height; i++) {
 663       for (j=0; j < width; j++) {
 664         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
 665       }
 666       src += stride;
 667       dst += stride;
 668     }
 669 }
 670
 671 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 672     int i,j;
 673     for (i=0; i < height; i++) {
 674       for (j=0; j < width; j++) {
 675         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
 676       }
 677       src += stride;
 678       dst += stride;
 679     }
 680 }
 681
 682 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 683     int i,j;
 684     for (i=0; i < height; i++) {
 685       for (j=0; j < width; j++) {
 686         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 687       }
 688       src += stride;
 689       dst += stride;
 690     }
 691 }
 692
 693 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 694     int i,j;
 695     for (i=0; i < height; i++) {
 696       for (j=0; j < width; j++) {
 697         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
 698       }
 699       src += stride;
 700       dst += stride;
 701     }
 702 }
 703
 704 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 705     int i,j;
 706     for (i=0; i < height; i++) {
 707       for (j=0; j < width; j++) {
 708         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
 709       }
 710       src += stride;
 711       dst += stride;
 712     }
 713 }
 714
 715 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 716     int i,j;
 717     for (i=0; i < height; i++) {
 718       for (j=0; j < width; j++) {
 719         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
 720       }
 721       src += stride;
 722       dst += stride;
 723     }
 724 }
 725
 726 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 727     switch(width){
 728     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
 729     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
 730     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
 731     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
 732     }
 733 }
 734
 735 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 736     int i,j;
 737     for (i=0; i < height; i++) {
 738       for (j=0; j < width; j++) {
 739         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
 740       }
 741       src += stride;
 742       dst += stride;
 743     }
 744 }
 745
 746 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 747     int i,j;
 748     for (i=0; i < height; i++) {
 749       for (j=0; j < width; j++) {
 750         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
 751       }
 752       src += stride;
 753       dst += stride;
 754     }
 755 }
 756
 757 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 758     int i,j;
 759     for (i=0; i < height; i++) {
 760       for (j=0; j < width; j++) {
 761         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
 762       }
 763       src += stride;
 764       dst += stride;
 765     }
 766 }
 767
 768 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 769     int i,j;
 770     for (i=0; i < height; i++) {
 771       for (j=0; j < width; j++) {
 772         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 773       }
 774       src += stride;
 775       dst += stride;
 776     }
 777 }
 778
 779 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 780     int i,j;
 781     for (i=0; i < height; i++) {
 782       for (j=0; j < width; j++) {
 783         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 784       }
 785       src += stride;
 786       dst += stride;
 787     }
 788 }
 789
 790 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 791     int i,j;
 792     for (i=0; i < height; i++) {
 793       for (j=0; j < width; j++) {
 794         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
 795       }
 796       src += stride;
 797       dst += stride;
 798     }
 799 }
 800
 801 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 802     int i,j;
 803     for (i=0; i < height; i++) {
 804       for (j=0; j < width; j++) {
 805         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 806       }
 807       src += stride;
 808       dst += stride;
 809     }
 810 }
 811
 812 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
 813     int i,j;
 814     for (i=0; i < height; i++) {
 815       for (j=0; j < width; j++) {
 816         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
 817       }
 818       src += stride;
 819       dst += stride;
 820     }
 821 }
 822 #if 0
 823 #define TPEL_WIDTH(width)\
 824 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 825     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
 826 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 827     void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
 828 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 829     void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
 830 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 831     void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
 832 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 833     void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
 834 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 835     void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
 836 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 837     void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
 838 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 839     void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
 840 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
 841     void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
 842 #endif
 843
 844 #define QPEL_MC(r, OPNAME, RND, OP) \
 845 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 846     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 847     int i;\
 848     for(i=0; i<h; i++)\
 849     {\
 850         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
 851         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
 852         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
 853         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
 854         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
 855         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
 856         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
 857         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
 858         dst+=dstStride;\
 859         src+=srcStride;\
 860     }\
 861 }\
 862 \
 863 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 864     const int w=8;\
 865     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 866     int i;\
 867     for(i=0; i<w; i++)\
 868     {\
 869         const int src0= src[0*srcStride];\
 870         const int src1= src[1*srcStride];\
 871         const int src2= src[2*srcStride];\
 872         const int src3= src[3*srcStride];\
 873         const int src4= src[4*srcStride];\
 874         const int src5= src[5*srcStride];\
 875         const int src6= src[6*srcStride];\
 876         const int src7= src[7*srcStride];\
 877         const int src8= src[8*srcStride];\
 878         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
 879         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
 880         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
 881         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
 882         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
 883         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
 884         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
 885         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
 886         dst++;\
 887         src++;\
 888     }\
 889 }\
 890 \
 891 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
 892     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 893     int i;\
 894     \
 895     for(i=0; i<h; i++)\
 896     {\
 897         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
 898         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
 899         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
 900         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
 901         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
 902         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
 903         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
 904         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
 905         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
 906         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
 907         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
 908         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
 909         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
 910         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
 911         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
 912         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
 913         dst+=dstStride;\
 914         src+=srcStride;\
 915     }\
 916 }\
 917 \
 918 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
 919     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
 920     int i;\
 921     const int w=16;\
 922     for(i=0; i<w; i++)\
 923     {\
 924         const int src0= src[0*srcStride];\
 925         const int src1= src[1*srcStride];\
 926         const int src2= src[2*srcStride];\
 927         const int src3= src[3*srcStride];\
 928         const int src4= src[4*srcStride];\
 929         const int src5= src[5*srcStride];\
 930         const int src6= src[6*srcStride];\
 931         const int src7= src[7*srcStride];\
 932         const int src8= src[8*srcStride];\
 933         const int src9= src[9*srcStride];\
 934         const int src10= src[10*srcStride];\
 935         const int src11= src[11*srcStride];\
 936         const int src12= src[12*srcStride];\
 937         const int src13= src[13*srcStride];\
 938         const int src14= src[14*srcStride];\
 939         const int src15= src[15*srcStride];\
 940         const int src16= src[16*srcStride];\
 941         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
 942         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
 943         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
 944         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
 945         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
 946         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
 947         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
 948         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
 949         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
 950         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
 951         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
 952         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
 953         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
 954         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
 955         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
 956         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
 957         dst++;\
 958         src++;\
 959     }\
 960 }\
 961 \
 962 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
 963     uint8_t half[64];\
 964     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 965     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
 966 }\
 967 \
 968 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
 969     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
 970 }\
 971 \
 972 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
 973     uint8_t half[64];\
 974     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
 975     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
 976 }\
 977 \
 978 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
 979     uint8_t full[16*9];\
 980     uint8_t half[64];\
 981     copy_block9(full, src, 16, stride, 9);\
 982     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 983     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
 984 }\
 985 \
 986 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
 987     uint8_t full[16*9];\
 988     copy_block9(full, src, 16, stride, 9);\
 989     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
 990 }\
 991 \
 992 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
 993     uint8_t full[16*9];\
 994     uint8_t half[64];\
 995     copy_block9(full, src, 16, stride, 9);\
 996     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
 997     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
 998 }\
 999 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000     uint8_t full[16*9];\
1001     uint8_t halfH[72];\
1002     uint8_t halfV[64];\
1003     uint8_t halfHV[64];\
1004     copy_block9(full, src, 16, stride, 9);\
1005     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 }\
1010 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1011     uint8_t full[16*9];\
1012     uint8_t halfH[72];\
1013     uint8_t halfHV[64];\
1014     copy_block9(full, src, 16, stride, 9);\
1015     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019 }\
1020 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021     uint8_t full[16*9];\
1022     uint8_t halfH[72];\
1023     uint8_t halfV[64];\
1024     uint8_t halfHV[64];\
1025     copy_block9(full, src, 16, stride, 9);\
1026     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 }\
1031 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1032     uint8_t full[16*9];\
1033     uint8_t halfH[72];\
1034     uint8_t halfHV[64];\
1035     copy_block9(full, src, 16, stride, 9);\
1036     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1040 }\
1041 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042     uint8_t full[16*9];\
1043     uint8_t halfH[72];\
1044     uint8_t halfV[64];\
1045     uint8_t halfHV[64];\
1046     copy_block9(full, src, 16, stride, 9);\
1047     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1048     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1049     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051 }\
1052 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1053     uint8_t full[16*9];\
1054     uint8_t halfH[72];\
1055     uint8_t halfHV[64];\
1056     copy_block9(full, src, 16, stride, 9);\
1057     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1059     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061 }\
1062 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1063     uint8_t full[16*9];\
1064     uint8_t halfH[72];\
1065     uint8_t halfV[64];\
1066     uint8_t halfHV[64];\
1067     copy_block9(full, src, 16, stride, 9);\
1068     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1069     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1070     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1071     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1072 }\
1073 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1074     uint8_t full[16*9];\
1075     uint8_t halfH[72];\
1076     uint8_t halfHV[64];\
1077     copy_block9(full, src, 16, stride, 9);\
1078     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1079     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1080     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1081     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1082 }\
1083 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1084     uint8_t halfH[72];\
1085     uint8_t halfHV[64];\
1086     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1089 }\
1090 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1091     uint8_t halfH[72];\
1092     uint8_t halfHV[64];\
1093     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1094     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1095     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1096 }\
1097 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1098     uint8_t full[16*9];\
1099     uint8_t halfH[72];\
1100     uint8_t halfV[64];\
1101     uint8_t halfHV[64];\
1102     copy_block9(full, src, 16, stride, 9);\
1103     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1105     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1106     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1107 }\
1108 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1109     uint8_t full[16*9];\
1110     uint8_t halfH[72];\
1111     copy_block9(full, src, 16, stride, 9);\
1112     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1113     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1114     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1115 }\
1116 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1117     uint8_t full[16*9];\
1118     uint8_t halfH[72];\
1119     uint8_t halfV[64];\
1120     uint8_t halfHV[64];\
1121     copy_block9(full, src, 16, stride, 9);\
1122     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1124     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1125     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1126 }\
1127 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1128     uint8_t full[16*9];\
1129     uint8_t halfH[72];\
1130     copy_block9(full, src, 16, stride, 9);\
1131     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1132     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1133     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1134 }\
1135 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1136     uint8_t halfH[72];\
1137     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1138     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1139 }\
1140 \
1141 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1142     uint8_t half[256];\
1143     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1144     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1145 }\
1146 \
1147 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1148     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1149 }\
1150 \
1151 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1152     uint8_t half[256];\
1153     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1154     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1155 }\
1156 \
1157 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1158     uint8_t full[24*17];\
1159     uint8_t half[256];\
1160     copy_block17(full, src, 24, stride, 17);\
1161     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1162     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1163 }\
1164 \
1165 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1166     uint8_t full[24*17];\
1167     copy_block17(full, src, 24, stride, 17);\
1168     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1169 }\
1170 \
1171 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1172     uint8_t full[24*17];\
1173     uint8_t half[256];\
1174     copy_block17(full, src, 24, stride, 17);\
1175     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1176     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1177 }\
1178 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179     uint8_t full[24*17];\
1180     uint8_t halfH[272];\
1181     uint8_t halfV[256];\
1182     uint8_t halfHV[256];\
1183     copy_block17(full, src, 24, stride, 17);\
1184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 }\
1189 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1190     uint8_t full[24*17];\
1191     uint8_t halfH[272];\
1192     uint8_t halfHV[256];\
1193     copy_block17(full, src, 24, stride, 17);\
1194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198 }\
1199 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200     uint8_t full[24*17];\
1201     uint8_t halfH[272];\
1202     uint8_t halfV[256];\
1203     uint8_t halfHV[256];\
1204     copy_block17(full, src, 24, stride, 17);\
1205     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 }\
1210 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1211     uint8_t full[24*17];\
1212     uint8_t halfH[272];\
1213     uint8_t halfHV[256];\
1214     copy_block17(full, src, 24, stride, 17);\
1215     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1219 }\
1220 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221     uint8_t full[24*17];\
1222     uint8_t halfH[272];\
1223     uint8_t halfV[256];\
1224     uint8_t halfHV[256];\
1225     copy_block17(full, src, 24, stride, 17);\
1226     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1227     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1228     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230 }\
1231 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1232     uint8_t full[24*17];\
1233     uint8_t halfH[272];\
1234     uint8_t halfHV[256];\
1235     copy_block17(full, src, 24, stride, 17);\
1236     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1238     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240 }\
1241 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1242     uint8_t full[24*17];\
1243     uint8_t halfH[272];\
1244     uint8_t halfV[256];\
1245     uint8_t halfHV[256];\
1246     copy_block17(full, src, 24, stride, 17);\
1247     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1248     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1249     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1250     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1251 }\
1252 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1253     uint8_t full[24*17];\
1254     uint8_t halfH[272];\
1255     uint8_t halfHV[256];\
1256     copy_block17(full, src, 24, stride, 17);\
1257     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1258     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1259     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1261 }\
1262 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1263     uint8_t halfH[272];\
1264     uint8_t halfHV[256];\
1265     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1266     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1267     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1268 }\
1269 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1270     uint8_t halfH[272];\
1271     uint8_t halfHV[256];\
1272     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1273     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1274     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1275 }\
1276 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1277     uint8_t full[24*17];\
1278     uint8_t halfH[272];\
1279     uint8_t halfV[256];\
1280     uint8_t halfHV[256];\
1281     copy_block17(full, src, 24, stride, 17);\
1282     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1284     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1285     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1286 }\
1287 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1288     uint8_t full[24*17];\
1289     uint8_t halfH[272];\
1290     copy_block17(full, src, 24, stride, 17);\
1291     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1292     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1293     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1294 }\
1295 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1296     uint8_t full[24*17];\
1297     uint8_t halfH[272];\
1298     uint8_t halfV[256];\
1299     uint8_t halfHV[256];\
1300     copy_block17(full, src, 24, stride, 17);\
1301     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1303     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1304     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1305 }\
1306 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1307     uint8_t full[24*17];\
1308     uint8_t halfH[272];\
1309     copy_block17(full, src, 24, stride, 17);\
1310     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1311     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1312     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1313 }\
1314 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1315     uint8_t halfH[272];\
1316     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1317     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1318 }
1319
1320 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1321 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1322 #define op_put(a, b) a = cm[((b) + 16)>>5]
1323 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1324
1325 QPEL_MC(0, put_       , _       , op_put)
1326 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1327 QPEL_MC(0, avg_       , _       , op_avg)
1328 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
1329 #undef op_avg
1330 #undef op_avg_no_rnd
1331 #undef op_put
1332 #undef op_put_no_rnd
1333
1334 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
1335 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1336 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1337 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1338 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1339 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1340
1341 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1342     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1343     int i;
1344
1345     for(i=0; i<h; i++){
1346         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1347         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1348         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1349         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1350         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1351         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1352         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1353         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1354         dst+=dstStride;
1355         src+=srcStride;
1356     }
1357 }
1358
1359 #if CONFIG_RV40_DECODER
1360 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1361     put_pixels16_xy2_8_c(dst, src, stride, 16);
1362 }
1363 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1364     avg_pixels16_xy2_8_c(dst, src, stride, 16);
1365 }
1366 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1367     put_pixels8_xy2_8_c(dst, src, stride, 8);
1368 }
1369 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1370     avg_pixels8_xy2_8_c(dst, src, stride, 8);
1371 }
1372 #endif /* CONFIG_RV40_DECODER */
1373
1374 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1375     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1376     int i;
1377
1378     for(i=0; i<w; i++){
1379         const int src_1= src[ -srcStride];
1380         const int src0 = src[0          ];
1381         const int src1 = src[  srcStride];
1382         const int src2 = src[2*srcStride];
1383         const int src3 = src[3*srcStride];
1384         const int src4 = src[4*srcStride];
1385         const int src5 = src[5*srcStride];
1386         const int src6 = src[6*srcStride];
1387         const int src7 = src[7*srcStride];
1388         const int src8 = src[8*srcStride];
1389         const int src9 = src[9*srcStride];
1390         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1391         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1392         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1393         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1394         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1395         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1396         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1397         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1398         src++;
1399         dst++;
1400     }
1401 }
1402
1403 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1404     uint8_t half[64];
1405     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1406     put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1407 }
1408
1409 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1410     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1411 }
1412
1413 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1414     uint8_t half[64];
1415     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1416     put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1417 }
1418
1419 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1420     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1421 }
1422
1423 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1424     uint8_t halfH[88];
1425     uint8_t halfV[64];
1426     uint8_t halfHV[64];
1427     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1428     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1429     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1430     put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1431 }
1432 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1433     uint8_t halfH[88];
1434     uint8_t halfV[64];
1435     uint8_t halfHV[64];
1436     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1437     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1438     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1439     put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1440 }
1441 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1442     uint8_t halfH[88];
1443     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1444     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1445 }
1446
1447 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1448     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1449     int x;
1450     const int strength= ff_h263_loop_filter_strength[qscale];
1451
1452     for(x=0; x<8; x++){
1453         int d1, d2, ad1;
1454         int p0= src[x-2*stride];
1455         int p1= src[x-1*stride];
1456         int p2= src[x+0*stride];
1457         int p3= src[x+1*stride];
1458         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1459
1460         if     (d<-2*strength) d1= 0;
1461         else if(d<-  strength) d1=-2*strength - d;
1462         else if(d<   strength) d1= d;
1463         else if(d< 2*strength) d1= 2*strength - d;
1464         else                   d1= 0;
1465
1466         p1 += d1;
1467         p2 -= d1;
1468         if(p1&256) p1= ~(p1>>31);
1469         if(p2&256) p2= ~(p2>>31);
1470
1471         src[x-1*stride] = p1;
1472         src[x+0*stride] = p2;
1473
1474         ad1= FFABS(d1)>>1;
1475
1476         d2= av_clip((p0-p3)/4, -ad1, ad1);
1477
1478         src[x-2*stride] = p0 - d2;
1479         src[x+  stride] = p3 + d2;
1480     }
1481     }
1482 }
1483
1484 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1485     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1486     int y;
1487     const int strength= ff_h263_loop_filter_strength[qscale];
1488
1489     for(y=0; y<8; y++){
1490         int d1, d2, ad1;
1491         int p0= src[y*stride-2];
1492         int p1= src[y*stride-1];
1493         int p2= src[y*stride+0];
1494         int p3= src[y*stride+1];
1495         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1496
1497         if     (d<-2*strength) d1= 0;
1498         else if(d<-  strength) d1=-2*strength - d;
1499         else if(d<   strength) d1= d;
1500         else if(d< 2*strength) d1= 2*strength - d;
1501         else                   d1= 0;
1502
1503         p1 += d1;
1504         p2 -= d1;
1505         if(p1&256) p1= ~(p1>>31);
1506         if(p2&256) p2= ~(p2>>31);
1507
1508         src[y*stride-1] = p1;
1509         src[y*stride+0] = p2;
1510
1511         ad1= FFABS(d1)>>1;
1512
1513         d2= av_clip((p0-p3)/4, -ad1, ad1);
1514
1515         src[y*stride-2] = p0 - d2;
1516         src[y*stride+1] = p3 + d2;
1517     }
1518     }
1519 }
1520
1521 static void h261_loop_filter_c(uint8_t *src, int stride){
1522     int x,y,xy,yz;
1523     int temp[64];
1524
1525     for(x=0; x<8; x++){
1526         temp[x      ] = 4*src[x           ];
1527         temp[x + 7*8] = 4*src[x + 7*stride];
1528     }
1529     for(y=1; y<7; y++){
1530         for(x=0; x<8; x++){
1531             xy = y * stride + x;
1532             yz = y * 8 + x;
1533             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1534         }
1535     }
1536
1537     for(y=0; y<8; y++){
1538         src[  y*stride] = (temp[  y*8] + 2)>>2;
1539         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1540         for(x=1; x<7; x++){
1541             xy = y * stride + x;
1542             yz = y * 8 + x;
1543             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
1544         }
1545     }
1546 }
1547
1548 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1549 {
1550     int s, i;
1551
1552     s = 0;
1553     for(i=0;i<h;i++) {
1554         s += abs(pix1[0] - pix2[0]);
1555         s += abs(pix1[1] - pix2[1]);
1556         s += abs(pix1[2] - pix2[2]);
1557         s += abs(pix1[3] - pix2[3]);
1558         s += abs(pix1[4] - pix2[4]);
1559         s += abs(pix1[5] - pix2[5]);
1560         s += abs(pix1[6] - pix2[6]);
1561         s += abs(pix1[7] - pix2[7]);
1562         s += abs(pix1[8] - pix2[8]);
1563         s += abs(pix1[9] - pix2[9]);
1564         s += abs(pix1[10] - pix2[10]);
1565         s += abs(pix1[11] - pix2[11]);
1566         s += abs(pix1[12] - pix2[12]);
1567         s += abs(pix1[13] - pix2[13]);
1568         s += abs(pix1[14] - pix2[14]);
1569         s += abs(pix1[15] - pix2[15]);
1570         pix1 += line_size;
1571         pix2 += line_size;
1572     }
1573     return s;
1574 }
1575
1576 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1577 {
1578     int s, i;
1579
1580     s = 0;
1581     for(i=0;i<h;i++) {
1582         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1583         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1584         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1585         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1586         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1587         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1588         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1589         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1590         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1591         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1592         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1593         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1594         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1595         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1596         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1597         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1598         pix1 += line_size;
1599         pix2 += line_size;
1600     }
1601     return s;
1602 }
1603
1604 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1605 {
1606     int s, i;
1607     uint8_t *pix3 = pix2 + line_size;
1608
1609     s = 0;
1610     for(i=0;i<h;i++) {
1611         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1612         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1613         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1614         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1615         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1616         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1617         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1618         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1619         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1620         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1621         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1622         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1623         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1624         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1625         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1626         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1627         pix1 += line_size;
1628         pix2 += line_size;
1629         pix3 += line_size;
1630     }
1631     return s;
1632 }
1633
1634 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1635 {
1636     int s, i;
1637     uint8_t *pix3 = pix2 + line_size;
1638
1639     s = 0;
1640     for(i=0;i<h;i++) {
1641         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1642         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1643         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1644         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1645         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1646         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1647         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1648         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1649         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1650         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1651         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1652         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1653         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1654         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1655         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1656         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1657         pix1 += line_size;
1658         pix2 += line_size;
1659         pix3 += line_size;
1660     }
1661     return s;
1662 }
1663
1664 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1665 {
1666     int s, i;
1667
1668     s = 0;
1669     for(i=0;i<h;i++) {
1670         s += abs(pix1[0] - pix2[0]);
1671         s += abs(pix1[1] - pix2[1]);
1672         s += abs(pix1[2] - pix2[2]);
1673         s += abs(pix1[3] - pix2[3]);
1674         s += abs(pix1[4] - pix2[4]);
1675         s += abs(pix1[5] - pix2[5]);
1676         s += abs(pix1[6] - pix2[6]);
1677         s += abs(pix1[7] - pix2[7]);
1678         pix1 += line_size;
1679         pix2 += line_size;
1680     }
1681     return s;
1682 }
1683
1684 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1685 {
1686     int s, i;
1687
1688     s = 0;
1689     for(i=0;i<h;i++) {
1690         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1691         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1692         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1693         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1694         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1695         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1696         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1697         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1698         pix1 += line_size;
1699         pix2 += line_size;
1700     }
1701     return s;
1702 }
1703
1704 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1705 {
1706     int s, i;
1707     uint8_t *pix3 = pix2 + line_size;
1708
1709     s = 0;
1710     for(i=0;i<h;i++) {
1711         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1712         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1713         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1714         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1715         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1716         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1717         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1718         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1719         pix1 += line_size;
1720         pix2 += line_size;
1721         pix3 += line_size;
1722     }
1723     return s;
1724 }
1725
1726 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1727 {
1728     int s, i;
1729     uint8_t *pix3 = pix2 + line_size;
1730
1731     s = 0;
1732     for(i=0;i<h;i++) {
1733         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1734         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1735         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1736         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1737         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1738         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1739         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1740         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1741         pix1 += line_size;
1742         pix2 += line_size;
1743         pix3 += line_size;
1744     }
1745     return s;
1746 }
1747
1748 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1749     MpegEncContext *c = v;
1750     int score1=0;
1751     int score2=0;
1752     int x,y;
1753
1754     for(y=0; y<h; y++){
1755         for(x=0; x<16; x++){
1756             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1757         }
1758         if(y+1<h){
1759             for(x=0; x<15; x++){
1760                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1761                              - s1[x+1] + s1[x+1+stride])
1762                         -FFABS(  s2[x  ] - s2[x  +stride]
1763                              - s2[x+1] + s2[x+1+stride]);
1764             }
1765         }
1766         s1+= stride;
1767         s2+= stride;
1768     }
1769
1770     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1771     else  return score1 + FFABS(score2)*8;
1772 }
1773
1774 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1775     MpegEncContext *c = v;
1776     int score1=0;
1777     int score2=0;
1778     int x,y;
1779
1780     for(y=0; y<h; y++){
1781         for(x=0; x<8; x++){
1782             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1783         }
1784         if(y+1<h){
1785             for(x=0; x<7; x++){
1786                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
1787                              - s1[x+1] + s1[x+1+stride])
1788                         -FFABS(  s2[x  ] - s2[x  +stride]
1789                              - s2[x+1] + s2[x+1+stride]);
1790             }
1791         }
1792         s1+= stride;
1793         s2+= stride;
1794     }
1795
1796     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1797     else  return score1 + FFABS(score2)*8;
1798 }
1799
1800 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1801     int i;
1802     unsigned int sum=0;
1803
1804     for(i=0; i<8*8; i++){
1805         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1806         int w= weight[i];
1807         b>>= RECON_SHIFT;
1808         assert(-512<b && b<512);
1809
1810         sum += (w*b)*(w*b)>>4;
1811     }
1812     return sum>>2;
1813 }
1814
1815 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1816     int i;
1817
1818     for(i=0; i<8*8; i++){
1819         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1820     }
1821 }
1822
1823 /**
1824  * permutes an 8x8 block.
1825  * @param block the block which will be permuted according to the given permutation vector
1826  * @param permutation the permutation vector
1827  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1828  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1829  *                  (inverse) permutated to scantable order!
1830  */
1831 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1832 {
1833     int i;
1834     DCTELEM temp[64];
1835
1836     if(last<=0) return;
1837     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1838
1839     for(i=0; i<=last; i++){
1840         const int j= scantable[i];
1841         temp[j]= block[j];
1842         block[j]=0;
1843     }
1844
1845     for(i=0; i<=last; i++){
1846         const int j= scantable[i];
1847         const int perm_j= permutation[j];
1848         block[perm_j]= temp[j];
1849     }
1850 }
1851
1852 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1853     return 0;
1854 }
1855
1856 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1857     int i;
1858
1859     memset(cmp, 0, sizeof(void*)*6);
1860
1861     for(i=0; i<6; i++){
1862         switch(type&0xFF){
1863         case FF_CMP_SAD:
1864             cmp[i]= c->sad[i];
1865             break;
1866         case FF_CMP_SATD:
1867             cmp[i]= c->hadamard8_diff[i];
1868             break;
1869         case FF_CMP_SSE:
1870             cmp[i]= c->sse[i];
1871             break;
1872         case FF_CMP_DCT:
1873             cmp[i]= c->dct_sad[i];
1874             break;
1875         case FF_CMP_DCT264:
1876             cmp[i]= c->dct264_sad[i];
1877             break;
1878         case FF_CMP_DCTMAX:
1879             cmp[i]= c->dct_max[i];
1880             break;
1881         case FF_CMP_PSNR:
1882             cmp[i]= c->quant_psnr[i];
1883             break;
1884         case FF_CMP_BIT:
1885             cmp[i]= c->bit[i];
1886             break;
1887         case FF_CMP_RD:
1888             cmp[i]= c->rd[i];
1889             break;
1890         case FF_CMP_VSAD:
1891             cmp[i]= c->vsad[i];
1892             break;
1893         case FF_CMP_VSSE:
1894             cmp[i]= c->vsse[i];
1895             break;
1896         case FF_CMP_ZERO:
1897             cmp[i]= zero_cmp;
1898             break;
1899         case FF_CMP_NSSE:
1900             cmp[i]= c->nsse[i];
1901             break;
1902 #if CONFIG_DWT
1903         case FF_CMP_W53:
1904             cmp[i]= c->w53[i];
1905             break;
1906         case FF_CMP_W97:
1907             cmp[i]= c->w97[i];
1908             break;
1909 #endif
1910         default:
1911             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1912         }
1913     }
1914 }
1915
1916 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1917     long i;
1918     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1919         long a = *(long*)(src+i);
1920         long b = *(long*)(dst+i);
1921         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1922     }
1923     for(; i<w; i++)
1924         dst[i+0] += src[i+0];
1925 }
1926
1927 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1928     long i;
1929     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1930         long a = *(long*)(src1+i);
1931         long b = *(long*)(src2+i);
1932         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1933     }
1934     for(; i<w; i++)
1935         dst[i] = src1[i]+src2[i];
1936 }
1937
1938 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1939     long i;
1940 #if !HAVE_FAST_UNALIGNED
1941     if((long)src2 & (sizeof(long)-1)){
1942         for(i=0; i+7<w; i+=8){
1943             dst[i+0] = src1[i+0]-src2[i+0];
1944             dst[i+1] = src1[i+1]-src2[i+1];
1945             dst[i+2] = src1[i+2]-src2[i+2];
1946             dst[i+3] = src1[i+3]-src2[i+3];
1947             dst[i+4] = src1[i+4]-src2[i+4];
1948             dst[i+5] = src1[i+5]-src2[i+5];
1949             dst[i+6] = src1[i+6]-src2[i+6];
1950             dst[i+7] = src1[i+7]-src2[i+7];
1951         }
1952     }else
1953 #endif
1954     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1955         long a = *(long*)(src1+i);
1956         long b = *(long*)(src2+i);
1957         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1958     }
1959     for(; i<w; i++)
1960         dst[i+0] = src1[i+0]-src2[i+0];
1961 }
1962
1963 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1964     int i;
1965     uint8_t l, lt;
1966
1967     l= *left;
1968     lt= *left_top;
1969
1970     for(i=0; i<w; i++){
1971         l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1972         lt= src1[i];
1973         dst[i]= l;
1974     }
1975
1976     *left= l;
1977     *left_top= lt;
1978 }
1979
1980 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1981     int i;
1982     uint8_t l, lt;
1983
1984     l= *left;
1985     lt= *left_top;
1986
1987     for(i=0; i<w; i++){
1988         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1989         lt= src1[i];
1990         l= src2[i];
1991         dst[i]= l - pred;
1992     }
1993
1994     *left= l;
1995     *left_top= lt;
1996 }
1997
1998 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1999     int i;
2000
2001     for(i=0; i<w-1; i++){
2002         acc+= src[i];
2003         dst[i]= acc;
2004         i++;
2005         acc+= src[i];
2006         dst[i]= acc;
2007     }
2008
2009     for(; i<w; i++){
2010         acc+= src[i];
2011         dst[i]= acc;
2012     }
2013
2014     return acc;
2015 }
2016
2017 #if HAVE_BIGENDIAN
2018 #define B 3
2019 #define G 2
2020 #define R 1
2021 #define A 0
2022 #else
2023 #define B 0
2024 #define G 1
2025 #define R 2
2026 #define A 3
2027 #endif
2028 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
2029     int i;
2030     int r,g,b,a;
2031     r= *red;
2032     g= *green;
2033     b= *blue;
2034     a= *alpha;
2035
2036     for(i=0; i<w; i++){
2037         b+= src[4*i+B];
2038         g+= src[4*i+G];
2039         r+= src[4*i+R];
2040         a+= src[4*i+A];
2041
2042         dst[4*i+B]= b;
2043         dst[4*i+G]= g;
2044         dst[4*i+R]= r;
2045         dst[4*i+A]= a;
2046     }
2047
2048     *red= r;
2049     *green= g;
2050     *blue= b;
2051     *alpha= a;
2052 }
2053 #undef B
2054 #undef G
2055 #undef R
2056 #undef A
2057
2058 #define BUTTERFLY2(o1,o2,i1,i2) \
2059 o1= (i1)+(i2);\
2060 o2= (i1)-(i2);
2061
2062 #define BUTTERFLY1(x,y) \
2063 {\
2064     int a,b;\
2065     a= x;\
2066     b= y;\
2067     x= a+b;\
2068     y= a-b;\
2069 }
2070
2071 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2072
2073 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2074     int i;
2075     int temp[64];
2076     int sum=0;
2077
2078     assert(h==8);
2079
2080     for(i=0; i<8; i++){
2081         //FIXME try pointer walks
2082         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2083         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2084         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2085         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2086
2087         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2088         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2089         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2090         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2091
2092         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2093         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2094         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2095         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2096     }
2097
2098     for(i=0; i<8; i++){
2099         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2100         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2101         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2102         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2103
2104         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2105         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2106         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2107         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2108
2109         sum +=
2110              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2111             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2112             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2113             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2114     }
2115     return sum;
2116 }
2117
2118 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2119     int i;
2120     int temp[64];
2121     int sum=0;
2122
2123     assert(h==8);
2124
2125     for(i=0; i<8; i++){
2126         //FIXME try pointer walks
2127         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2128         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2129         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2130         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2131
2132         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2133         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2134         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2135         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2136
2137         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2138         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2139         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2140         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2141     }
2142
2143     for(i=0; i<8; i++){
2144         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2145         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2146         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2147         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2148
2149         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2150         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2151         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2152         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2153
2154         sum +=
2155              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2156             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2157             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2158             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2159     }
2160
2161     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2162
2163     return sum;
2164 }
2165
2166 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2167     MpegEncContext * const s= (MpegEncContext *)c;
2168     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2169
2170     assert(h==8);
2171
2172     s->dsp.diff_pixels(temp, src1, src2, stride);
2173     s->dsp.fdct(temp);
2174     return s->dsp.sum_abs_dctelem(temp);
2175 }
2176
2177 #if CONFIG_GPL
2178 #define DCT8_1D {\
2179     const int s07 = SRC(0) + SRC(7);\
2180     const int s16 = SRC(1) + SRC(6);\
2181     const int s25 = SRC(2) + SRC(5);\
2182     const int s34 = SRC(3) + SRC(4);\
2183     const int a0 = s07 + s34;\
2184     const int a1 = s16 + s25;\
2185     const int a2 = s07 - s34;\
2186     const int a3 = s16 - s25;\
2187     const int d07 = SRC(0) - SRC(7);\
2188     const int d16 = SRC(1) - SRC(6);\
2189     const int d25 = SRC(2) - SRC(5);\
2190     const int d34 = SRC(3) - SRC(4);\
2191     const int a4 = d16 + d25 + (d07 + (d07>>1));\
2192     const int a5 = d07 - d34 - (d25 + (d25>>1));\
2193     const int a6 = d07 + d34 - (d16 + (d16>>1));\
2194     const int a7 = d16 - d25 + (d34 + (d34>>1));\
2195     DST(0,  a0 + a1     ) ;\
2196     DST(1,  a4 + (a7>>2)) ;\
2197     DST(2,  a2 + (a3>>1)) ;\
2198     DST(3,  a5 + (a6>>2)) ;\
2199     DST(4,  a0 - a1     ) ;\
2200     DST(5,  a6 - (a5>>2)) ;\
2201     DST(6, (a2>>1) - a3 ) ;\
2202     DST(7, (a4>>2) - a7 ) ;\
2203 }
2204
2205 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2206     MpegEncContext * const s= (MpegEncContext *)c;
2207     DCTELEM dct[8][8];
2208     int i;
2209     int sum=0;
2210
2211     s->dsp.diff_pixels(dct[0], src1, src2, stride);
2212
2213 #define SRC(x) dct[i][x]
2214 #define DST(x,v) dct[i][x]= v
2215     for( i = 0; i < 8; i++ )
2216         DCT8_1D
2217 #undef SRC
2218 #undef DST
2219
2220 #define SRC(x) dct[x][i]
2221 #define DST(x,v) sum += FFABS(v)
2222     for( i = 0; i < 8; i++ )
2223         DCT8_1D
2224 #undef SRC
2225 #undef DST
2226     return sum;
2227 }
2228 #endif
2229
2230 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2231     MpegEncContext * const s= (MpegEncContext *)c;
2232     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2233     int sum=0, i;
2234
2235     assert(h==8);
2236
2237     s->dsp.diff_pixels(temp, src1, src2, stride);
2238     s->dsp.fdct(temp);
2239
2240     for(i=0; i<64; i++)
2241         sum= FFMAX(sum, FFABS(temp[i]));
2242
2243     return sum;
2244 }
2245
2246 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2247     MpegEncContext * const s= (MpegEncContext *)c;
2248     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2249     DCTELEM * const bak = temp+64;
2250     int sum=0, i;
2251
2252     assert(h==8);
2253     s->mb_intra=0;
2254
2255     s->dsp.diff_pixels(temp, src1, src2, stride);
2256
2257     memcpy(bak, temp, 64*sizeof(DCTELEM));
2258
2259     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2260     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2261     ff_simple_idct(temp); //FIXME
2262
2263     for(i=0; i<64; i++)
2264         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2265
2266     return sum;
2267 }
2268
2269 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2270     MpegEncContext * const s= (MpegEncContext *)c;
2271     const uint8_t *scantable= s->intra_scantable.permutated;
2272     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2273     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2274     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2275     int i, last, run, bits, level, distortion, start_i;
2276     const int esc_length= s->ac_esc_length;
2277     uint8_t * length;
2278     uint8_t * last_length;
2279
2280     assert(h==8);
2281
2282     copy_block8(lsrc1, src1, 8, stride, 8);
2283     copy_block8(lsrc2, src2, 8, stride, 8);
2284
2285     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2286
2287     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2288
2289     bits=0;
2290
2291     if (s->mb_intra) {
2292         start_i = 1;
2293         length     = s->intra_ac_vlc_length;
2294         last_length= s->intra_ac_vlc_last_length;
2295         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2296     } else {
2297         start_i = 0;
2298         length     = s->inter_ac_vlc_length;
2299         last_length= s->inter_ac_vlc_last_length;
2300     }
2301
2302     if(last>=start_i){
2303         run=0;
2304         for(i=start_i; i<last; i++){
2305             int j= scantable[i];
2306             level= temp[j];
2307
2308             if(level){
2309                 level+=64;
2310                 if((level&(~127)) == 0){
2311                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2312                 }else
2313                     bits+= esc_length;
2314                 run=0;
2315             }else
2316                 run++;
2317         }
2318         i= scantable[last];
2319
2320         level= temp[i] + 64;
2321
2322         assert(level - 64);
2323
2324         if((level&(~127)) == 0){
2325             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2326         }else
2327             bits+= esc_length;
2328
2329     }
2330
2331     if(last>=0){
2332         if(s->mb_intra)
2333             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2334         else
2335             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2336     }
2337
2338     s->dsp.idct_add(lsrc2, 8, temp);
2339
2340     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2341
2342     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2343 }
2344
2345 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2346     MpegEncContext * const s= (MpegEncContext *)c;
2347     const uint8_t *scantable= s->intra_scantable.permutated;
2348     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2349     int i, last, run, bits, level, start_i;
2350     const int esc_length= s->ac_esc_length;
2351     uint8_t * length;
2352     uint8_t * last_length;
2353
2354     assert(h==8);
2355
2356     s->dsp.diff_pixels(temp, src1, src2, stride);
2357
2358     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2359
2360     bits=0;
2361
2362     if (s->mb_intra) {
2363         start_i = 1;
2364         length     = s->intra_ac_vlc_length;
2365         last_length= s->intra_ac_vlc_last_length;
2366         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2367     } else {
2368         start_i = 0;
2369         length     = s->inter_ac_vlc_length;
2370         last_length= s->inter_ac_vlc_last_length;
2371     }
2372
2373     if(last>=start_i){
2374         run=0;
2375         for(i=start_i; i<last; i++){
2376             int j= scantable[i];
2377             level= temp[j];
2378
2379             if(level){
2380                 level+=64;
2381                 if((level&(~127)) == 0){
2382                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
2383                 }else
2384                     bits+= esc_length;
2385                 run=0;
2386             }else
2387                 run++;
2388         }
2389         i= scantable[last];
2390
2391         level= temp[i] + 64;
2392
2393         assert(level - 64);
2394
2395         if((level&(~127)) == 0){
2396             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2397         }else
2398             bits+= esc_length;
2399     }
2400
2401     return bits;
2402 }
2403
2404 #define VSAD_INTRA(size) \
2405 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2406     int score=0;                                                                                            \
2407     int x,y;                                                                                                \
2408                                                                                                             \
2409     for(y=1; y<h; y++){                                                                                     \
2410         for(x=0; x<size; x+=4){                                                                             \
2411             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
2412                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
2413         }                                                                                                   \
2414         s+= stride;                                                                                         \
2415     }                                                                                                       \
2416                                                                                                             \
2417     return score;                                                                                           \
2418 }
2419 VSAD_INTRA(8)
2420 VSAD_INTRA(16)
2421
2422 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2423     int score=0;
2424     int x,y;
2425
2426     for(y=1; y<h; y++){
2427         for(x=0; x<16; x++){
2428             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
2429         }
2430         s1+= stride;
2431         s2+= stride;
2432     }
2433
2434     return score;
2435 }
2436
2437 #define SQ(a) ((a)*(a))
2438 #define VSSE_INTRA(size) \
2439 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2440     int score=0;                                                                                            \
2441     int x,y;                                                                                                \
2442                                                                                                             \
2443     for(y=1; y<h; y++){                                                                                     \
2444         for(x=0; x<size; x+=4){                                                                               \
2445             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
2446                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
2447         }                                                                                                   \
2448         s+= stride;                                                                                         \
2449     }                                                                                                       \
2450                                                                                                             \
2451     return score;                                                                                           \
2452 }
2453 VSSE_INTRA(8)
2454 VSSE_INTRA(16)
2455
2456 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2457     int score=0;
2458     int x,y;
2459
2460     for(y=1; y<h; y++){
2461         for(x=0; x<16; x++){
2462             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
2463         }
2464         s1+= stride;
2465         s2+= stride;
2466     }
2467
2468     return score;
2469 }
2470
2471 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2472                                int size){
2473     int score=0;
2474     int i;
2475     for(i=0; i<size; i++)
2476         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2477     return score;
2478 }
2479
2480 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2481 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2482 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2483 #if CONFIG_GPL
2484 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2485 #endif
2486 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2487 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2488 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2489 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2490
2491 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2492     int i;
2493     for(i=0; i<len; i++)
2494         dst[i] = src0[i] * src1[i];
2495 }
2496
2497 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2498     int i;
2499     src1 += len-1;
2500     for(i=0; i<len; i++)
2501         dst[i] = src0[i] * src1[-i];
2502 }
2503
2504 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2505     int i;
2506     for(i=0; i<len; i++)
2507         dst[i] = src0[i] * src1[i] + src2[i];
2508 }
2509
2510 static void vector_fmul_window_c(float *dst, const float *src0,
2511                                  const float *src1, const float *win, int len)
2512 {
2513     int i,j;
2514     dst += len;
2515     win += len;
2516     src0+= len;
2517     for(i=-len, j=len-1; i<0; i++, j--) {
2518         float s0 = src0[i];
2519         float s1 = src1[j];
2520         float wi = win[i];
2521         float wj = win[j];
2522         dst[i] = s0*wj - s1*wi;
2523         dst[j] = s0*wi + s1*wj;
2524     }
2525 }
2526
2527 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2528                                  int len)
2529 {
2530     int i;
2531     for (i = 0; i < len; i++)
2532         dst[i] = src[i] * mul;
2533 }
2534
2535 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
2536                                       const float **sv, float mul, int len)
2537 {
2538     int i;
2539     for (i = 0; i < len; i += 2, sv++) {
2540         dst[i  ] = src[i  ] * sv[0][0] * mul;
2541         dst[i+1] = src[i+1] * sv[0][1] * mul;
2542     }
2543 }
2544
2545 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
2546                                       const float **sv, float mul, int len)
2547 {
2548     int i;
2549     for (i = 0; i < len; i += 4, sv++) {
2550         dst[i  ] = src[i  ] * sv[0][0] * mul;
2551         dst[i+1] = src[i+1] * sv[0][1] * mul;
2552         dst[i+2] = src[i+2] * sv[0][2] * mul;
2553         dst[i+3] = src[i+3] * sv[0][3] * mul;
2554     }
2555 }
2556
2557 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
2558                                int len)
2559 {
2560     int i;
2561     for (i = 0; i < len; i += 2, sv++) {
2562         dst[i  ] = sv[0][0] * mul;
2563         dst[i+1] = sv[0][1] * mul;
2564     }
2565 }
2566
2567 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
2568                                int len)
2569 {
2570     int i;
2571     for (i = 0; i < len; i += 4, sv++) {
2572         dst[i  ] = sv[0][0] * mul;
2573         dst[i+1] = sv[0][1] * mul;
2574         dst[i+2] = sv[0][2] * mul;
2575         dst[i+3] = sv[0][3] * mul;
2576     }
2577 }
2578
2579 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2580                                 int len)
2581 {
2582     int i;
2583     for (i = 0; i < len; i++) {
2584         float t = v1[i] - v2[i];
2585         v1[i] += v2[i];
2586         v2[i] = t;
2587     }
2588 }
2589
2590 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2591 {
2592     float p = 0.0;
2593     int i;
2594
2595     for (i = 0; i < len; i++)
2596         p += v1[i] * v2[i];
2597
2598     return p;
2599 }
2600
2601 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2602                    uint32_t maxi, uint32_t maxisign)
2603 {
2604
2605     if(a > mini) return mini;
2606     else if((a^(1U<<31)) > maxisign) return maxi;
2607     else return a;
2608 }
2609
2610 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2611     int i;
2612     uint32_t mini = *(uint32_t*)min;
2613     uint32_t maxi = *(uint32_t*)max;
2614     uint32_t maxisign = maxi ^ (1U<<31);
2615     uint32_t *dsti = (uint32_t*)dst;
2616     const uint32_t *srci = (const uint32_t*)src;
2617     for(i=0; i<len; i+=8) {
2618         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2619         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2620         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2621         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2622         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2623         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2624         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2625         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2626     }
2627 }
2628 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2629     int i;
2630     if(min < 0 && max > 0) {
2631         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2632     } else {
2633         for(i=0; i < len; i+=8) {
2634             dst[i    ] = av_clipf(src[i    ], min, max);
2635             dst[i + 1] = av_clipf(src[i + 1], min, max);
2636             dst[i + 2] = av_clipf(src[i + 2], min, max);
2637             dst[i + 3] = av_clipf(src[i + 3], min, max);
2638             dst[i + 4] = av_clipf(src[i + 4], min, max);
2639             dst[i + 5] = av_clipf(src[i + 5], min, max);
2640             dst[i + 6] = av_clipf(src[i + 6], min, max);
2641             dst[i + 7] = av_clipf(src[i + 7], min, max);
2642         }
2643     }
2644 }
2645
2646 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2647 {
2648     int res = 0;
2649
2650     while (order--)
2651         res += (*v1++ * *v2++) >> shift;
2652
2653     return res;
2654 }
2655
2656 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2657 {
2658     int res = 0;
2659     while (order--) {
2660         res   += *v1 * *v2++;
2661         *v1++ += mul * *v3++;
2662     }
2663     return res;
2664 }
2665
2666 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2667                                  const int16_t *window, unsigned int len)
2668 {
2669     int i;
2670     int len2 = len >> 1;
2671
2672     for (i = 0; i < len2; i++) {
2673         int16_t w       = window[i];
2674         output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2675         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2676     }
2677 }
2678
2679 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2680                                 int32_t max, unsigned int len)
2681 {
2682     do {
2683         *dst++ = av_clip(*src++, min, max);
2684         *dst++ = av_clip(*src++, min, max);
2685         *dst++ = av_clip(*src++, min, max);
2686         *dst++ = av_clip(*src++, min, max);
2687         *dst++ = av_clip(*src++, min, max);
2688         *dst++ = av_clip(*src++, min, max);
2689         *dst++ = av_clip(*src++, min, max);
2690         *dst++ = av_clip(*src++, min, max);
2691         len -= 8;
2692     } while (len > 0);
2693 }
2694
2695 #define W0 2048
2696 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2697 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2698 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2699 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2700 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2701 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2702 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2703
2704 static void wmv2_idct_row(short * b)
2705 {
2706     int s1,s2;
2707     int a0,a1,a2,a3,a4,a5,a6,a7;
2708     /*step 1*/
2709     a1 = W1*b[1]+W7*b[7];
2710     a7 = W7*b[1]-W1*b[7];
2711     a5 = W5*b[5]+W3*b[3];
2712     a3 = W3*b[5]-W5*b[3];
2713     a2 = W2*b[2]+W6*b[6];
2714     a6 = W6*b[2]-W2*b[6];
2715     a0 = W0*b[0]+W0*b[4];
2716     a4 = W0*b[0]-W0*b[4];
2717     /*step 2*/
2718     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2719     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2720     /*step 3*/
2721     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2722     b[1] = (a4+a6 +s1   + (1<<7))>>8;
2723     b[2] = (a4-a6 +s2   + (1<<7))>>8;
2724     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2725     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2726     b[5] = (a4-a6 -s2   + (1<<7))>>8;
2727     b[6] = (a4+a6 -s1   + (1<<7))>>8;
2728     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2729 }
2730 static void wmv2_idct_col(short * b)
2731 {
2732     int s1,s2;
2733     int a0,a1,a2,a3,a4,a5,a6,a7;
2734     /*step 1, with extended precision*/
2735     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2736     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2737     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2738     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2739     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2740     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2741     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2742     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
2743     /*step 2*/
2744     s1 = (181*(a1-a5+a7-a3)+128)>>8;
2745     s2 = (181*(a1-a5-a7+a3)+128)>>8;
2746     /*step 3*/
2747     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2748     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2749     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2750     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2751
2752     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2753     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2754     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2755     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2756 }
2757 void ff_wmv2_idct_c(short * block){
2758     int i;
2759
2760     for(i=0;i<64;i+=8){
2761         wmv2_idct_row(block+i);
2762     }
2763     for(i=0;i<8;i++){
2764         wmv2_idct_col(block+i);
2765     }
2766 }
2767 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2768  converted */
2769 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2770 {
2771     ff_wmv2_idct_c(block);
2772     ff_put_pixels_clamped_c(block, dest, line_size);
2773 }
2774 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2775 {
2776     ff_wmv2_idct_c(block);
2777     ff_add_pixels_clamped_c(block, dest, line_size);
2778 }
2779 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2780 {
2781     j_rev_dct (block);
2782     ff_put_pixels_clamped_c(block, dest, line_size);
2783 }
2784 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2785 {
2786     j_rev_dct (block);
2787     ff_add_pixels_clamped_c(block, dest, line_size);
2788 }
2789
2790 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2791 {
2792     j_rev_dct4 (block);
2793     put_pixels_clamped4_c(block, dest, line_size);
2794 }
2795 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2796 {
2797     j_rev_dct4 (block);
2798     add_pixels_clamped4_c(block, dest, line_size);
2799 }
2800
2801 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2802 {
2803     j_rev_dct2 (block);
2804     put_pixels_clamped2_c(block, dest, line_size);
2805 }
2806 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2807 {
2808     j_rev_dct2 (block);
2809     add_pixels_clamped2_c(block, dest, line_size);
2810 }
2811
2812 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2813 {
2814     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2815
2816     dest[0] = cm[(block[0] + 4)>>3];
2817 }
2818 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2819 {
2820     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2821
2822     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2823 }
2824
2825 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2826
2827 /* init static data */
2828 av_cold void dsputil_static_init(void)
2829 {
2830     int i;
2831
2832     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2833     for(i=0;i<MAX_NEG_CROP;i++) {
2834         ff_cropTbl[i] = 0;
2835         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2836     }
2837
2838     for(i=0;i<512;i++) {
2839         ff_squareTbl[i] = (i - 256) * (i - 256);
2840     }
2841
2842     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2843 }
2844
2845 int ff_check_alignment(void){
2846     static int did_fail=0;
2847     DECLARE_ALIGNED(16, int, aligned);
2848
2849     if((intptr_t)&aligned & 15){
2850         if(!did_fail){
2851 #if HAVE_MMX || HAVE_ALTIVEC
2852             av_log(NULL, AV_LOG_ERROR,
2853                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2854                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2855                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2856                 "Do not report crashes to Libav developers.\n");
2857 #endif
2858             did_fail=1;
2859         }
2860         return -1;
2861     }
2862     return 0;
2863 }
2864
2865 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2866 {
2867     int i;
2868
2869     ff_check_alignment();
2870
2871 #if CONFIG_ENCODERS
2872     if(avctx->dct_algo==FF_DCT_FASTINT) {
2873         c->fdct = fdct_ifast;
2874         c->fdct248 = fdct_ifast248;
2875     }
2876     else if(avctx->dct_algo==FF_DCT_FAAN) {
2877         c->fdct = ff_faandct;
2878         c->fdct248 = ff_faandct248;
2879     }
2880     else {
2881         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2882         c->fdct248 = ff_fdct248_islow;
2883     }
2884 #endif //CONFIG_ENCODERS
2885
2886     if(avctx->lowres==1){
2887         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2888             c->idct_put= ff_jref_idct4_put;
2889             c->idct_add= ff_jref_idct4_add;
2890         }else{
2891             if (avctx->codec_id != CODEC_ID_H264) {
2892                 c->idct_put= ff_h264_lowres_idct_put_8_c;
2893                 c->idct_add= ff_h264_lowres_idct_add_8_c;
2894             } else {
2895                 switch (avctx->bits_per_raw_sample) {
2896                     case 9:
2897                         c->idct_put= ff_h264_lowres_idct_put_9_c;
2898                         c->idct_add= ff_h264_lowres_idct_add_9_c;
2899                         break;
2900                     case 10:
2901                         c->idct_put= ff_h264_lowres_idct_put_10_c;
2902                         c->idct_add= ff_h264_lowres_idct_add_10_c;
2903                         break;
2904                     default:
2905                         c->idct_put= ff_h264_lowres_idct_put_8_c;
2906                         c->idct_add= ff_h264_lowres_idct_add_8_c;
2907                 }
2908             }
2909         }
2910         c->idct    = j_rev_dct4;
2911         c->idct_permutation_type= FF_NO_IDCT_PERM;
2912     }else if(avctx->lowres==2){
2913         c->idct_put= ff_jref_idct2_put;
2914         c->idct_add= ff_jref_idct2_add;
2915         c->idct    = j_rev_dct2;
2916         c->idct_permutation_type= FF_NO_IDCT_PERM;
2917     }else if(avctx->lowres==3){
2918         c->idct_put= ff_jref_idct1_put;
2919         c->idct_add= ff_jref_idct1_add;
2920         c->idct    = j_rev_dct1;
2921         c->idct_permutation_type= FF_NO_IDCT_PERM;
2922     }else{
2923         if(avctx->idct_algo==FF_IDCT_INT){
2924             c->idct_put= ff_jref_idct_put;
2925             c->idct_add= ff_jref_idct_add;
2926             c->idct    = j_rev_dct;
2927             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2928         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2929                 avctx->idct_algo==FF_IDCT_VP3){
2930             c->idct_put= ff_vp3_idct_put_c;
2931             c->idct_add= ff_vp3_idct_add_c;
2932             c->idct    = ff_vp3_idct_c;
2933             c->idct_permutation_type= FF_NO_IDCT_PERM;
2934         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2935             c->idct_put= ff_wmv2_idct_put_c;
2936             c->idct_add= ff_wmv2_idct_add_c;
2937             c->idct    = ff_wmv2_idct_c;
2938             c->idct_permutation_type= FF_NO_IDCT_PERM;
2939         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2940             c->idct_put= ff_faanidct_put;
2941             c->idct_add= ff_faanidct_add;
2942             c->idct    = ff_faanidct;
2943             c->idct_permutation_type= FF_NO_IDCT_PERM;
2944         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2945             c->idct_put= ff_ea_idct_put_c;
2946             c->idct_permutation_type= FF_NO_IDCT_PERM;
2947         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2948             c->idct     = ff_bink_idct_c;
2949             c->idct_add = ff_bink_idct_add_c;
2950             c->idct_put = ff_bink_idct_put_c;
2951             c->idct_permutation_type = FF_NO_IDCT_PERM;
2952         }else{ //accurate/default
2953             c->idct_put= ff_simple_idct_put;
2954             c->idct_add= ff_simple_idct_add;
2955             c->idct    = ff_simple_idct;
2956             c->idct_permutation_type= FF_NO_IDCT_PERM;
2957         }
2958     }
2959
2960     c->get_pixels = get_pixels_c;
2961     c->diff_pixels = diff_pixels_c;
2962     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2963     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2964     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2965     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2966     c->sum_abs_dctelem = sum_abs_dctelem_c;
2967     c->gmc1 = gmc1_c;
2968     c->gmc = ff_gmc_c;
2969     c->pix_sum = pix_sum_c;
2970     c->pix_norm1 = pix_norm1_c;
2971
2972     c->fill_block_tab[0] = fill_block16_c;
2973     c->fill_block_tab[1] = fill_block8_c;
2974     c->scale_block = scale_block_c;
2975
2976     /* TODO [0] 16  [1] 8 */
2977     c->pix_abs[0][0] = pix_abs16_c;
2978     c->pix_abs[0][1] = pix_abs16_x2_c;
2979     c->pix_abs[0][2] = pix_abs16_y2_c;
2980     c->pix_abs[0][3] = pix_abs16_xy2_c;
2981     c->pix_abs[1][0] = pix_abs8_c;
2982     c->pix_abs[1][1] = pix_abs8_x2_c;
2983     c->pix_abs[1][2] = pix_abs8_y2_c;
2984     c->pix_abs[1][3] = pix_abs8_xy2_c;
2985
2986     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2987     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2988     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2989     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2990     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2991     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2992     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2993     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2994     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2995
2996     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2997     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2998     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2999     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3000     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3001     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3002     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3003     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3004     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3005
3006 #define dspfunc(PFX, IDX, NUM) \
3007     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3008     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3009     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3010     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3011     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3012     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3013     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3014     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3015     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3016     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3017     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3018     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3019     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3020     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3021     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3022     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3023
3024     dspfunc(put_qpel, 0, 16);
3025     dspfunc(put_no_rnd_qpel, 0, 16);
3026
3027     dspfunc(avg_qpel, 0, 16);
3028     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3029
3030     dspfunc(put_qpel, 1, 8);
3031     dspfunc(put_no_rnd_qpel, 1, 8);
3032
3033     dspfunc(avg_qpel, 1, 8);
3034     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3035
3036 #undef dspfunc
3037
3038 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3039     ff_mlp_init(c, avctx);
3040 #endif
3041 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3042     ff_intrax8dsp_init(c,avctx);
3043 #endif
3044 #if CONFIG_RV30_DECODER
3045     ff_rv30dsp_init(c,avctx);
3046 #endif
3047 #if CONFIG_RV40_DECODER
3048     ff_rv40dsp_init(c,avctx);
3049     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3050     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3051     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3052     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3053 #endif
3054
3055     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3056     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3057     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3058     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3059     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3060     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3061     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3062     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3063
3064 #define SET_CMP_FUNC(name) \
3065     c->name[0]= name ## 16_c;\
3066     c->name[1]= name ## 8x8_c;
3067
3068     SET_CMP_FUNC(hadamard8_diff)
3069     c->hadamard8_diff[4]= hadamard8_intra16_c;
3070     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3071     SET_CMP_FUNC(dct_sad)
3072     SET_CMP_FUNC(dct_max)
3073 #if CONFIG_GPL
3074     SET_CMP_FUNC(dct264_sad)
3075 #endif
3076     c->sad[0]= pix_abs16_c;
3077     c->sad[1]= pix_abs8_c;
3078     c->sse[0]= sse16_c;
3079     c->sse[1]= sse8_c;
3080     c->sse[2]= sse4_c;
3081     SET_CMP_FUNC(quant_psnr)
3082     SET_CMP_FUNC(rd)
3083     SET_CMP_FUNC(bit)
3084     c->vsad[0]= vsad16_c;
3085     c->vsad[4]= vsad_intra16_c;
3086     c->vsad[5]= vsad_intra8_c;
3087     c->vsse[0]= vsse16_c;
3088     c->vsse[4]= vsse_intra16_c;
3089     c->vsse[5]= vsse_intra8_c;
3090     c->nsse[0]= nsse16_c;
3091     c->nsse[1]= nsse8_c;
3092 #if CONFIG_DWT
3093     ff_dsputil_init_dwt(c);
3094 #endif
3095
3096     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3097
3098     c->add_bytes= add_bytes_c;
3099     c->add_bytes_l2= add_bytes_l2_c;
3100     c->diff_bytes= diff_bytes_c;
3101     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3102     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3103     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3104     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3105     c->bswap_buf= bswap_buf;
3106     c->bswap16_buf = bswap16_buf;
3107 #if CONFIG_PNG_DECODER
3108     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3109 #endif
3110
3111     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3112         c->h263_h_loop_filter= h263_h_loop_filter_c;
3113         c->h263_v_loop_filter= h263_v_loop_filter_c;
3114     }
3115
3116     if (CONFIG_VP3_DECODER) {
3117         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3118         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3119         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3120     }
3121
3122     c->h261_loop_filter= h261_loop_filter_c;
3123
3124     c->try_8x8basis= try_8x8basis_c;
3125     c->add_8x8basis= add_8x8basis_c;
3126
3127 #if CONFIG_VORBIS_DECODER
3128     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3129 #endif
3130 #if CONFIG_AC3_DECODER
3131     c->ac3_downmix = ff_ac3_downmix_c;
3132 #endif
3133     c->vector_fmul = vector_fmul_c;
3134     c->vector_fmul_reverse = vector_fmul_reverse_c;
3135     c->vector_fmul_add = vector_fmul_add_c;
3136     c->vector_fmul_window = vector_fmul_window_c;
3137     c->vector_clipf = vector_clipf_c;
3138     c->scalarproduct_int16 = scalarproduct_int16_c;
3139     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3140     c->apply_window_int16 = apply_window_int16_c;
3141     c->vector_clip_int32 = vector_clip_int32_c;
3142     c->scalarproduct_float = scalarproduct_float_c;
3143     c->butterflies_float = butterflies_float_c;
3144     c->vector_fmul_scalar = vector_fmul_scalar_c;
3145
3146     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3147     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3148
3149     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3150     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3151
3152     c->shrink[0]= av_image_copy_plane;
3153     c->shrink[1]= ff_shrink22;
3154     c->shrink[2]= ff_shrink44;
3155     c->shrink[3]= ff_shrink88;
3156
3157     c->prefetch= just_return;
3158
3159     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3160     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3161
3162 #undef FUNC
3163 #undef FUNCC
3164 #define FUNC(f, depth) f ## _ ## depth
3165 #define FUNCC(f, depth) f ## _ ## depth ## _c
3166
3167 #define dspfunc1(PFX, IDX, NUM, depth)\
3168     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3169     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3170     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3171     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3172
3173 #define dspfunc2(PFX, IDX, NUM, depth)\
3174     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3175     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3176     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3177     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3178     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3179     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3180     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3181     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3182     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3183     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3184     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3185     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3186     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3187     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3188     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3189     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3190
3191
3192 #define BIT_DEPTH_FUNCS(depth)\
3193     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3194     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3195     c->clear_block                   = FUNCC(clear_block           , depth);\
3196     c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
3197     c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
3198     c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
3199     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3200     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3201 \
3202     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3203     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3204     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3205     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3206     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3207     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3208 \
3209     dspfunc1(put       , 0, 16, depth);\
3210     dspfunc1(put       , 1,  8, depth);\
3211     dspfunc1(put       , 2,  4, depth);\
3212     dspfunc1(put       , 3,  2, depth);\
3213     dspfunc1(put_no_rnd, 0, 16, depth);\
3214     dspfunc1(put_no_rnd, 1,  8, depth);\
3215     dspfunc1(avg       , 0, 16, depth);\
3216     dspfunc1(avg       , 1,  8, depth);\
3217     dspfunc1(avg       , 2,  4, depth);\
3218     dspfunc1(avg       , 3,  2, depth);\
3219     dspfunc1(avg_no_rnd, 0, 16, depth);\
3220     dspfunc1(avg_no_rnd, 1,  8, depth);\
3221 \
3222     dspfunc2(put_h264_qpel, 0, 16, depth);\
3223     dspfunc2(put_h264_qpel, 1,  8, depth);\
3224     dspfunc2(put_h264_qpel, 2,  4, depth);\
3225     dspfunc2(put_h264_qpel, 3,  2, depth);\
3226     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3227     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3228     dspfunc2(avg_h264_qpel, 2,  4, depth);
3229
3230     if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3231         BIT_DEPTH_FUNCS(8)
3232     } else {
3233         switch (avctx->bits_per_raw_sample) {
3234             case 9:
3235                 BIT_DEPTH_FUNCS(9)
3236                 break;
3237             case 10:
3238                 BIT_DEPTH_FUNCS(10)
3239                 break;
3240             default:
3241                 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3242                 BIT_DEPTH_FUNCS(8)
3243                 break;
3244         }
3245     }
3246
3247
3248     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3249     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3250     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3251     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3252     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3253     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3254     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3255     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3256     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3257
3258     for(i=0; i<64; i++){
3259         if(!c->put_2tap_qpel_pixels_tab[0][i])
3260             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3261         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3262             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3263     }
3264
3265     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3266     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3267     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3268     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3269
3270     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3271     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3272     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3273     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3274
3275     switch(c->idct_permutation_type){
3276     case FF_NO_IDCT_PERM:
3277         for(i=0; i<64; i++)
3278             c->idct_permutation[i]= i;
3279         break;
3280     case FF_LIBMPEG2_IDCT_PERM:
3281         for(i=0; i<64; i++)
3282             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3283         break;
3284     case FF_SIMPLE_IDCT_PERM:
3285         for(i=0; i<64; i++)
3286             c->idct_permutation[i]= simple_mmx_permutation[i];
3287         break;
3288     case FF_TRANSPOSE_IDCT_PERM:
3289         for(i=0; i<64; i++)
3290             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3291         break;
3292     case FF_PARTTRANS_IDCT_PERM:
3293         for(i=0; i<64; i++)
3294             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3295         break;
3296     case FF_SSE2_IDCT_PERM:
3297         for(i=0; i<64; i++)
3298             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3299         break;
3300     default:
3301         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3302     }
3303 }
3304