git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavcore/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "lpc.h"
  40 #include "ac3dec.h"
  41 #include "vorbis.h"
  42 #include "png.h"
  43 #include "vp8dsp.h"
  44
/* Lookup table accessed by the pixel put/add helpers through
 * (ff_cropTbl + MAX_NEG_CROP), so that negative indices are valid.
 * Zero-initialized here; presumably filled with clamp values by init code
 * that is not visible in this section -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table accessed by the sse and norm helpers through
 * (ff_squareTbl + 256), which lets them index it with pixel differences in
 * the range [-255, 255]. Zero-initialized here; presumably filled with
 * squares elsewhere -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
  47
// Byte-replicated constants. ~0UL/255 is 0x01 repeated in every byte of an
// unsigned long, so pb_7f is 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f (and pb_80 the
// same with 0x80), depending on the width of unsigned long.
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  51
/* Zigzag scan order for an 8x8 block: entry i holds the position (0..63,
 * presumably row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  62
/* Zigzag scan specific to the 248 IDCT. NOTE: unlike the specification,
   the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  75
/* Inverse of ff_zigzag_direct (not permuted), with 1 added to each entry,
 * for the MMX quantizer. Declared here with 16-byte alignment; it is filled
 * in elsewhere (not visible in this section). */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  78
/* Alternate scan order for an 8x8 block that walks mostly along rows.
 * Entry i holds the position (0..63) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  89
/* Alternate scan order for an 8x8 block that walks mostly down columns.
 * Entry i holds the position (0..63) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 100
/* Input coefficient permutation expected by simple_idct_mmx: a 64-entry
 * table of positions 0x00..0x3F, each appearing once. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 112
/* 8-entry permutation of the indices 0..7; presumably the row order used by
 * the SSE2 IDCT -- its user is not visible in this section, TODO confirm. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 114
 115 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 116     int i;
 117     int end;
 118
 119     st->scantable= src_scantable;
 120
 121     for(i=0; i<64; i++){
 122         int j;
 123         j = src_scantable[i];
 124         st->permutated[i] = permutation[j];
 125 #if ARCH_PPC
 126         st->inverse[j] = i;
 127 #endif
 128     }
 129
 130     end=-1;
 131     for(i=0; i<64; i++){
 132         int j;
 133         j = st->permutated[i];
 134         if(j>end) end=j;
 135         st->raster_end[i]= end;
 136     }
 137 }
 138
 139 static int pix_sum_c(uint8_t * pix, int line_size)
 140 {
 141     int s, i, j;
 142
 143     s = 0;
 144     for (i = 0; i < 16; i++) {
 145         for (j = 0; j < 16; j += 8) {
 146             s += pix[0];
 147             s += pix[1];
 148             s += pix[2];
 149             s += pix[3];
 150             s += pix[4];
 151             s += pix[5];
 152             s += pix[6];
 153             s += pix[7];
 154             pix += 8;
 155         }
 156         pix += line_size - 16;
 157     }
 158     return s;
 159 }
 160
 161 static int pix_norm1_c(uint8_t * pix, int line_size)
 162 {
 163     int s, i, j;
 164     uint32_t *sq = ff_squareTbl + 256;
 165
 166     s = 0;
 167     for (i = 0; i < 16; i++) {
 168         for (j = 0; j < 16; j += 8) {
 169 #if 0
 170             s += sq[pix[0]];
 171             s += sq[pix[1]];
 172             s += sq[pix[2]];
 173             s += sq[pix[3]];
 174             s += sq[pix[4]];
 175             s += sq[pix[5]];
 176             s += sq[pix[6]];
 177             s += sq[pix[7]];
 178 #else
 179 #if LONG_MAX > 2147483647
 180             register uint64_t x=*(uint64_t*)pix;
 181             s += sq[x&0xff];
 182             s += sq[(x>>8)&0xff];
 183             s += sq[(x>>16)&0xff];
 184             s += sq[(x>>24)&0xff];
 185             s += sq[(x>>32)&0xff];
 186             s += sq[(x>>40)&0xff];
 187             s += sq[(x>>48)&0xff];
 188             s += sq[(x>>56)&0xff];
 189 #else
 190             register uint32_t x=*(uint32_t*)pix;
 191             s += sq[x&0xff];
 192             s += sq[(x>>8)&0xff];
 193             s += sq[(x>>16)&0xff];
 194             s += sq[(x>>24)&0xff];
 195             x=*(uint32_t*)(pix+4);
 196             s += sq[x&0xff];
 197             s += sq[(x>>8)&0xff];
 198             s += sq[(x>>16)&0xff];
 199             s += sq[(x>>24)&0xff];
 200 #endif
 201 #endif
 202             pix += 8;
 203         }
 204         pix += line_size - 16;
 205     }
 206     return s;
 207 }
 208
 209 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 210     int i;
 211
 212     for(i=0; i+8<=w; i+=8){
 213         dst[i+0]= av_bswap32(src[i+0]);
 214         dst[i+1]= av_bswap32(src[i+1]);
 215         dst[i+2]= av_bswap32(src[i+2]);
 216         dst[i+3]= av_bswap32(src[i+3]);
 217         dst[i+4]= av_bswap32(src[i+4]);
 218         dst[i+5]= av_bswap32(src[i+5]);
 219         dst[i+6]= av_bswap32(src[i+6]);
 220         dst[i+7]= av_bswap32(src[i+7]);
 221     }
 222     for(;i<w; i++){
 223         dst[i+0]= av_bswap32(src[i+0]);
 224     }
 225 }
 226
 227 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 228 {
 229     int s, i;
 230     uint32_t *sq = ff_squareTbl + 256;
 231
 232     s = 0;
 233     for (i = 0; i < h; i++) {
 234         s += sq[pix1[0] - pix2[0]];
 235         s += sq[pix1[1] - pix2[1]];
 236         s += sq[pix1[2] - pix2[2]];
 237         s += sq[pix1[3] - pix2[3]];
 238         pix1 += line_size;
 239         pix2 += line_size;
 240     }
 241     return s;
 242 }
 243
 244 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 245 {
 246     int s, i;
 247     uint32_t *sq = ff_squareTbl + 256;
 248
 249     s = 0;
 250     for (i = 0; i < h; i++) {
 251         s += sq[pix1[0] - pix2[0]];
 252         s += sq[pix1[1] - pix2[1]];
 253         s += sq[pix1[2] - pix2[2]];
 254         s += sq[pix1[3] - pix2[3]];
 255         s += sq[pix1[4] - pix2[4]];
 256         s += sq[pix1[5] - pix2[5]];
 257         s += sq[pix1[6] - pix2[6]];
 258         s += sq[pix1[7] - pix2[7]];
 259         pix1 += line_size;
 260         pix2 += line_size;
 261     }
 262     return s;
 263 }
 264
 265 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 266 {
 267     int s, i;
 268     uint32_t *sq = ff_squareTbl + 256;
 269
 270     s = 0;
 271     for (i = 0; i < h; i++) {
 272         s += sq[pix1[ 0] - pix2[ 0]];
 273         s += sq[pix1[ 1] - pix2[ 1]];
 274         s += sq[pix1[ 2] - pix2[ 2]];
 275         s += sq[pix1[ 3] - pix2[ 3]];
 276         s += sq[pix1[ 4] - pix2[ 4]];
 277         s += sq[pix1[ 5] - pix2[ 5]];
 278         s += sq[pix1[ 6] - pix2[ 6]];
 279         s += sq[pix1[ 7] - pix2[ 7]];
 280         s += sq[pix1[ 8] - pix2[ 8]];
 281         s += sq[pix1[ 9] - pix2[ 9]];
 282         s += sq[pix1[10] - pix2[10]];
 283         s += sq[pix1[11] - pix2[11]];
 284         s += sq[pix1[12] - pix2[12]];
 285         s += sq[pix1[13] - pix2[13]];
 286         s += sq[pix1[14] - pix2[14]];
 287         s += sq[pix1[15] - pix2[15]];
 288
 289         pix1 += line_size;
 290         pix2 += line_size;
 291     }
 292     return s;
 293 }
 294
 295 /* draw the edges of width 'w' of an image of size width, height */
 296 //FIXME check that this is ok for mpeg4 interlaced
 297 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 298 {
 299     uint8_t *ptr, *last_line;
 300     int i;
 301
 302     last_line = buf + (height - 1) * wrap;
 303     for(i=0;i<w;i++) {
 304         /* top and bottom */
 305         memcpy(buf - (i + 1) * wrap, buf, width);
 306         memcpy(last_line + (i + 1) * wrap, last_line, width);
 307     }
 308     /* left and right */
 309     ptr = buf;
 310     for(i=0;i<height;i++) {
 311         memset(ptr - w, ptr[0], w);
 312         memset(ptr + width, ptr[width-1], w);
 313         ptr += wrap;
 314     }
 315     /* corners */
 316     for(i=0;i<w;i++) {
 317         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 318         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 319         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 320         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 321     }
 322 }
 323
 324 /**
 325  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 326  * @param buf destination buffer
 327  * @param src source buffer
 328  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 329  * @param block_w width of block
 330  * @param block_h height of block
 331  * @param src_x x coordinate of the top left sample of the block in the source buffer
 332  * @param src_y y coordinate of the top left sample of the block in the source buffer
 333  * @param w width of the source buffer
 334  * @param h height of the source buffer
 335  */
 336 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 337                                     int src_x, int src_y, int w, int h){
 338     int x, y;
 339     int start_y, start_x, end_y, end_x;
 340
 341     if(src_y>= h){
 342         src+= (h-1-src_y)*linesize;
 343         src_y=h-1;
 344     }else if(src_y<=-block_h){
 345         src+= (1-block_h-src_y)*linesize;
 346         src_y=1-block_h;
 347     }
 348     if(src_x>= w){
 349         src+= (w-1-src_x);
 350         src_x=w-1;
 351     }else if(src_x<=-block_w){
 352         src+= (1-block_w-src_x);
 353         src_x=1-block_w;
 354     }
 355
 356     start_y= FFMAX(0, -src_y);
 357     start_x= FFMAX(0, -src_x);
 358     end_y= FFMIN(block_h, h-src_y);
 359     end_x= FFMIN(block_w, w-src_x);
 360
 361     // copy existing part
 362     for(y=start_y; y<end_y; y++){
 363         for(x=start_x; x<end_x; x++){
 364             buf[x + y*linesize]= src[x + y*linesize];
 365         }
 366     }
 367
 368     //top
 369     for(y=0; y<start_y; y++){
 370         for(x=start_x; x<end_x; x++){
 371             buf[x + y*linesize]= buf[x + start_y*linesize];
 372         }
 373     }
 374
 375     //bottom
 376     for(y=end_y; y<block_h; y++){
 377         for(x=start_x; x<end_x; x++){
 378             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 379         }
 380     }
 381
 382     for(y=0; y<block_h; y++){
 383        //left
 384         for(x=0; x<start_x; x++){
 385             buf[x + y*linesize]= buf[start_x + y*linesize];
 386         }
 387
 388        //right
 389         for(x=end_x; x<block_w; x++){
 390             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 391         }
 392     }
 393 }
 394
 395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 396 {
 397     int i;
 398
 399     /* read the pixels */
 400     for(i=0;i<8;i++) {
 401         block[0] = pixels[0];
 402         block[1] = pixels[1];
 403         block[2] = pixels[2];
 404         block[3] = pixels[3];
 405         block[4] = pixels[4];
 406         block[5] = pixels[5];
 407         block[6] = pixels[6];
 408         block[7] = pixels[7];
 409         pixels += line_size;
 410         block += 8;
 411     }
 412 }
 413
 414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 415                           const uint8_t *s2, int stride){
 416     int i;
 417
 418     /* read the pixels */
 419     for(i=0;i<8;i++) {
 420         block[0] = s1[0] - s2[0];
 421         block[1] = s1[1] - s2[1];
 422         block[2] = s1[2] - s2[2];
 423         block[3] = s1[3] - s2[3];
 424         block[4] = s1[4] - s2[4];
 425         block[5] = s1[5] - s2[5];
 426         block[6] = s1[6] - s2[6];
 427         block[7] = s1[7] - s2[7];
 428         s1 += stride;
 429         s2 += stride;
 430         block += 8;
 431     }
 432 }
 433
 434
 435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 436                                  int line_size)
 437 {
 438     int i;
 439     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 440
 441     /* read the pixels */
 442     for(i=0;i<8;i++) {
 443         pixels[0] = cm[block[0]];
 444         pixels[1] = cm[block[1]];
 445         pixels[2] = cm[block[2]];
 446         pixels[3] = cm[block[3]];
 447         pixels[4] = cm[block[4]];
 448         pixels[5] = cm[block[5]];
 449         pixels[6] = cm[block[6]];
 450         pixels[7] = cm[block[7]];
 451
 452         pixels += line_size;
 453         block += 8;
 454     }
 455 }
 456
 457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 458                                  int line_size)
 459 {
 460     int i;
 461     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 462
 463     /* read the pixels */
 464     for(i=0;i<4;i++) {
 465         pixels[0] = cm[block[0]];
 466         pixels[1] = cm[block[1]];
 467         pixels[2] = cm[block[2]];
 468         pixels[3] = cm[block[3]];
 469
 470         pixels += line_size;
 471         block += 8;
 472     }
 473 }
 474
 475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 476                                  int line_size)
 477 {
 478     int i;
 479     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 480
 481     /* read the pixels */
 482     for(i=0;i<2;i++) {
 483         pixels[0] = cm[block[0]];
 484         pixels[1] = cm[block[1]];
 485
 486         pixels += line_size;
 487         block += 8;
 488     }
 489 }
 490
 491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 492                                         uint8_t *restrict pixels,
 493                                         int line_size)
 494 {
 495     int i, j;
 496
 497     for (i = 0; i < 8; i++) {
 498         for (j = 0; j < 8; j++) {
 499             if (*block < -128)
 500                 *pixels = 0;
 501             else if (*block > 127)
 502                 *pixels = 255;
 503             else
 504                 *pixels = (uint8_t)(*block + 128);
 505             block++;
 506             pixels++;
 507         }
 508         pixels += (line_size - 8);
 509     }
 510 }
 511
 512 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 513                                     int line_size)
 514 {
 515     int i;
 516
 517     /* read the pixels */
 518     for(i=0;i<8;i++) {
 519         pixels[0] = block[0];
 520         pixels[1] = block[1];
 521         pixels[2] = block[2];
 522         pixels[3] = block[3];
 523         pixels[4] = block[4];
 524         pixels[5] = block[5];
 525         pixels[6] = block[6];
 526         pixels[7] = block[7];
 527
 528         pixels += line_size;
 529         block += 8;
 530     }
 531 }
 532
 533 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 534                           int line_size)
 535 {
 536     int i;
 537     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 538
 539     /* read the pixels */
 540     for(i=0;i<8;i++) {
 541         pixels[0] = cm[pixels[0] + block[0]];
 542         pixels[1] = cm[pixels[1] + block[1]];
 543         pixels[2] = cm[pixels[2] + block[2]];
 544         pixels[3] = cm[pixels[3] + block[3]];
 545         pixels[4] = cm[pixels[4] + block[4]];
 546         pixels[5] = cm[pixels[5] + block[5]];
 547         pixels[6] = cm[pixels[6] + block[6]];
 548         pixels[7] = cm[pixels[7] + block[7]];
 549         pixels += line_size;
 550         block += 8;
 551     }
 552 }
 553
 554 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 555                           int line_size)
 556 {
 557     int i;
 558     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 559
 560     /* read the pixels */
 561     for(i=0;i<4;i++) {
 562         pixels[0] = cm[pixels[0] + block[0]];
 563         pixels[1] = cm[pixels[1] + block[1]];
 564         pixels[2] = cm[pixels[2] + block[2]];
 565         pixels[3] = cm[pixels[3] + block[3]];
 566         pixels += line_size;
 567         block += 8;
 568     }
 569 }
 570
 571 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 572                           int line_size)
 573 {
 574     int i;
 575     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 576
 577     /* read the pixels */
 578     for(i=0;i<2;i++) {
 579         pixels[0] = cm[pixels[0] + block[0]];
 580         pixels[1] = cm[pixels[1] + block[1]];
 581         pixels += line_size;
 582         block += 8;
 583     }
 584 }
 585
 586 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 587 {
 588     int i;
 589     for(i=0;i<8;i++) {
 590         pixels[0] += block[0];
 591         pixels[1] += block[1];
 592         pixels[2] += block[2];
 593         pixels[3] += block[3];
 594         pixels[4] += block[4];
 595         pixels[5] += block[5];
 596         pixels[6] += block[6];
 597         pixels[7] += block[7];
 598         pixels += line_size;
 599         block += 8;
 600     }
 601 }
 602
 603 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 604 {
 605     int i;
 606     for(i=0;i<4;i++) {
 607         pixels[0] += block[0];
 608         pixels[1] += block[1];
 609         pixels[2] += block[2];
 610         pixels[3] += block[3];
 611         pixels += line_size;
 612         block += 4;
 613     }
 614 }
 615
 616 static int sum_abs_dctelem_c(DCTELEM *block)
 617 {
 618     int sum=0, i;
 619     for(i=0; i<64; i++)
 620         sum+= FFABS(block[i]);
 621     return sum;
 622 }
 623
 624 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 625 {
 626     int i;
 627
 628     for (i = 0; i < h; i++) {
 629         memset(block, value, 16);
 630         block += line_size;
 631     }
 632 }
 633
 634 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 635 {
 636     int i;
 637
 638     for (i = 0; i < h; i++) {
 639         memset(block, value, 8);
 640         block += line_size;
 641     }
 642 }
 643
 644 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 645 {
 646     int i, j;
 647     uint16_t *dst1 = (uint16_t *) dst;
 648     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 649
 650     for (j = 0; j < 8; j++) {
 651         for (i = 0; i < 8; i++) {
 652             dst1[i] = dst2[i] = src[i] * 0x0101;
 653         }
 654         src  += 8;
 655         dst1 += linesize;
 656         dst2 += linesize;
 657     }
 658 }
 659
 660 #if 0
 661
 662 #define PIXOP2(OPNAME, OP) \
 663 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 664 {\
 665     int i;\
 666     for(i=0; i<h; i++){\
 667         OP(*((uint64_t*)block), AV_RN64(pixels));\
 668         pixels+=line_size;\
 669         block +=line_size;\
 670     }\
 671 }\
 672 \
 673 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 674 {\
 675     int i;\
 676     for(i=0; i<h; i++){\
 677         const uint64_t a= AV_RN64(pixels  );\
 678         const uint64_t b= AV_RN64(pixels+1);\
 679         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 680         pixels+=line_size;\
 681         block +=line_size;\
 682     }\
 683 }\
 684 \
 685 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 686 {\
 687     int i;\
 688     for(i=0; i<h; i++){\
 689         const uint64_t a= AV_RN64(pixels  );\
 690         const uint64_t b= AV_RN64(pixels+1);\
 691         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 692         pixels+=line_size;\
 693         block +=line_size;\
 694     }\
 695 }\
 696 \
 697 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 698 {\
 699     int i;\
 700     for(i=0; i<h; i++){\
 701         const uint64_t a= AV_RN64(pixels          );\
 702         const uint64_t b= AV_RN64(pixels+line_size);\
 703         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 704         pixels+=line_size;\
 705         block +=line_size;\
 706     }\
 707 }\
 708 \
 709 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 710 {\
 711     int i;\
 712     for(i=0; i<h; i++){\
 713         const uint64_t a= AV_RN64(pixels          );\
 714         const uint64_t b= AV_RN64(pixels+line_size);\
 715         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 716         pixels+=line_size;\
 717         block +=line_size;\
 718     }\
 719 }\
 720 \
 721 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 722 {\
 723         int i;\
 724         const uint64_t a= AV_RN64(pixels  );\
 725         const uint64_t b= AV_RN64(pixels+1);\
 726         uint64_t l0=  (a&0x0303030303030303ULL)\
 727                     + (b&0x0303030303030303ULL)\
 728                     + 0x0202020202020202ULL;\
 729         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 730                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 731         uint64_t l1,h1;\
 732 \
 733         pixels+=line_size;\
 734         for(i=0; i<h; i+=2){\
 735             uint64_t a= AV_RN64(pixels  );\
 736             uint64_t b= AV_RN64(pixels+1);\
 737             l1=  (a&0x0303030303030303ULL)\
 738                + (b&0x0303030303030303ULL);\
 739             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 740               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 741             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 742             pixels+=line_size;\
 743             block +=line_size;\
 744             a= AV_RN64(pixels  );\
 745             b= AV_RN64(pixels+1);\
 746             l0=  (a&0x0303030303030303ULL)\
 747                + (b&0x0303030303030303ULL)\
 748                + 0x0202020202020202ULL;\
 749             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 750               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 751             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 752             pixels+=line_size;\
 753             block +=line_size;\
 754         }\
 755 }\
 756 \
 757 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 758 {\
 759         int i;\
 760         const uint64_t a= AV_RN64(pixels  );\
 761         const uint64_t b= AV_RN64(pixels+1);\
 762         uint64_t l0=  (a&0x0303030303030303ULL)\
 763                     + (b&0x0303030303030303ULL)\
 764                     + 0x0101010101010101ULL;\
 765         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 766                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 767         uint64_t l1,h1;\
 768 \
 769         pixels+=line_size;\
 770         for(i=0; i<h; i+=2){\
 771             uint64_t a= AV_RN64(pixels  );\
 772             uint64_t b= AV_RN64(pixels+1);\
 773             l1=  (a&0x0303030303030303ULL)\
 774                + (b&0x0303030303030303ULL);\
 775             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 776               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 777             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 778             pixels+=line_size;\
 779             block +=line_size;\
 780             a= AV_RN64(pixels  );\
 781             b= AV_RN64(pixels+1);\
 782             l0=  (a&0x0303030303030303ULL)\
 783                + (b&0x0303030303030303ULL)\
 784                + 0x0101010101010101ULL;\
 785             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 786               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 787             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 788             pixels+=line_size;\
 789             block +=line_size;\
 790         }\
 791 }\
 792 \
 793 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 794 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 795 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 796 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 797 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 798 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 799 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 800
 801 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 802 #else // 64 bit variant
 803
 804 #define PIXOP2(OPNAME, OP) \
 805 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 806     int i;\
 807     for(i=0; i<h; i++){\
 808         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 809         pixels+=line_size;\
 810         block +=line_size;\
 811     }\
 812 }\
 813 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 814     int i;\
 815     for(i=0; i<h; i++){\
 816         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 817         pixels+=line_size;\
 818         block +=line_size;\
 819     }\
 820 }\
 821 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 822     int i;\
 823     for(i=0; i<h; i++){\
 824         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 825         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 826         pixels+=line_size;\
 827         block +=line_size;\
 828     }\
 829 }\
 830 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 831     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 832 }\
 833 \
 834 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 835                                                 int src_stride1, int src_stride2, int h){\
 836     int i;\
 837     for(i=0; i<h; i++){\
 838         uint32_t a,b;\
 839         a= AV_RN32(&src1[i*src_stride1  ]);\
 840         b= AV_RN32(&src2[i*src_stride2  ]);\
 841         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 842         a= AV_RN32(&src1[i*src_stride1+4]);\
 843         b= AV_RN32(&src2[i*src_stride2+4]);\
 844         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 845     }\
 846 }\
 847 \
 848 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 849                                                 int src_stride1, int src_stride2, int h){\
 850     int i;\
 851     for(i=0; i<h; i++){\
 852         uint32_t a,b;\
 853         a= AV_RN32(&src1[i*src_stride1  ]);\
 854         b= AV_RN32(&src2[i*src_stride2  ]);\
 855         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 856         a= AV_RN32(&src1[i*src_stride1+4]);\
 857         b= AV_RN32(&src2[i*src_stride2+4]);\
 858         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 859     }\
 860 }\
 861 \
 862 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 863                                                 int src_stride1, int src_stride2, int h){\
 864     int i;\
 865     for(i=0; i<h; i++){\
 866         uint32_t a,b;\
 867         a= AV_RN32(&src1[i*src_stride1  ]);\
 868         b= AV_RN32(&src2[i*src_stride2  ]);\
 869         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 870     }\
 871 }\
 872 \
 873 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 874                                                 int src_stride1, int src_stride2, int h){\
 875     int i;\
 876     for(i=0; i<h; i++){\
 877         uint32_t a,b;\
 878         a= AV_RN16(&src1[i*src_stride1  ]);\
 879         b= AV_RN16(&src2[i*src_stride2  ]);\
 880         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 881     }\
 882 }\
 883 \
 884 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 885                                                 int src_stride1, int src_stride2, int h){\
 886     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 887     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 888 }\
 889 \
 890 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 891                                                 int src_stride1, int src_stride2, int h){\
 892     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 893     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 894 }\
 895 \
 896 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 897     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 898 }\
 899 \
 900 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 901     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 902 }\
 903 \
 904 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 905     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 906 }\
 907 \
 908 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 909     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 910 }\
 911 \
 912 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 913                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 914     int i;\
 915     for(i=0; i<h; i++){\
 916         uint32_t a, b, c, d, l0, l1, h0, h1;\
 917         a= AV_RN32(&src1[i*src_stride1]);\
 918         b= AV_RN32(&src2[i*src_stride2]);\
 919         c= AV_RN32(&src3[i*src_stride3]);\
 920         d= AV_RN32(&src4[i*src_stride4]);\
 921         l0=  (a&0x03030303UL)\
 922            + (b&0x03030303UL)\
 923            + 0x02020202UL;\
 924         h0= ((a&0xFCFCFCFCUL)>>2)\
 925           + ((b&0xFCFCFCFCUL)>>2);\
 926         l1=  (c&0x03030303UL)\
 927            + (d&0x03030303UL);\
 928         h1= ((c&0xFCFCFCFCUL)>>2)\
 929           + ((d&0xFCFCFCFCUL)>>2);\
 930         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 931         a= AV_RN32(&src1[i*src_stride1+4]);\
 932         b= AV_RN32(&src2[i*src_stride2+4]);\
 933         c= AV_RN32(&src3[i*src_stride3+4]);\
 934         d= AV_RN32(&src4[i*src_stride4+4]);\
 935         l0=  (a&0x03030303UL)\
 936            + (b&0x03030303UL)\
 937            + 0x02020202UL;\
 938         h0= ((a&0xFCFCFCFCUL)>>2)\
 939           + ((b&0xFCFCFCFCUL)>>2);\
 940         l1=  (c&0x03030303UL)\
 941            + (d&0x03030303UL);\
 942         h1= ((c&0xFCFCFCFCUL)>>2)\
 943           + ((d&0xFCFCFCFCUL)>>2);\
 944         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 945     }\
 946 }\
 947 \
 948 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 949     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 950 }\
 951 \
 952 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 953     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 954 }\
 955 \
 956 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 957     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 958 }\
 959 \
 960 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 961     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 962 }\
 963 \
 964 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 965                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 966     int i;\
 967     for(i=0; i<h; i++){\
 968         uint32_t a, b, c, d, l0, l1, h0, h1;\
 969         a= AV_RN32(&src1[i*src_stride1]);\
 970         b= AV_RN32(&src2[i*src_stride2]);\
 971         c= AV_RN32(&src3[i*src_stride3]);\
 972         d= AV_RN32(&src4[i*src_stride4]);\
 973         l0=  (a&0x03030303UL)\
 974            + (b&0x03030303UL)\
 975            + 0x01010101UL;\
 976         h0= ((a&0xFCFCFCFCUL)>>2)\
 977           + ((b&0xFCFCFCFCUL)>>2);\
 978         l1=  (c&0x03030303UL)\
 979            + (d&0x03030303UL);\
 980         h1= ((c&0xFCFCFCFCUL)>>2)\
 981           + ((d&0xFCFCFCFCUL)>>2);\
 982         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 983         a= AV_RN32(&src1[i*src_stride1+4]);\
 984         b= AV_RN32(&src2[i*src_stride2+4]);\
 985         c= AV_RN32(&src3[i*src_stride3+4]);\
 986         d= AV_RN32(&src4[i*src_stride4+4]);\
 987         l0=  (a&0x03030303UL)\
 988            + (b&0x03030303UL)\
 989            + 0x01010101UL;\
 990         h0= ((a&0xFCFCFCFCUL)>>2)\
 991           + ((b&0xFCFCFCFCUL)>>2);\
 992         l1=  (c&0x03030303UL)\
 993            + (d&0x03030303UL);\
 994         h1= ((c&0xFCFCFCFCUL)>>2)\
 995           + ((d&0xFCFCFCFCUL)>>2);\
 996         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 997     }\
 998 }\
 999 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1000                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1001     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1002     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1003 }\
1004 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1005                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1006     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1008 }\
1009 \
1010 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1011 {\
1012         int i, a0, b0, a1, b1;\
1013         a0= pixels[0];\
1014         b0= pixels[1] + 2;\
1015         a0 += b0;\
1016         b0 += pixels[2];\
1017 \
1018         pixels+=line_size;\
1019         for(i=0; i<h; i+=2){\
1020             a1= pixels[0];\
1021             b1= pixels[1];\
1022             a1 += b1;\
1023             b1 += pixels[2];\
1024 \
1025             block[0]= (a1+a0)>>2; /* FIXME non put */\
1026             block[1]= (b1+b0)>>2;\
1027 \
1028             pixels+=line_size;\
1029             block +=line_size;\
1030 \
1031             a0= pixels[0];\
1032             b0= pixels[1] + 2;\
1033             a0 += b0;\
1034             b0 += pixels[2];\
1035 \
1036             block[0]= (a1+a0)>>2;\
1037             block[1]= (b1+b0)>>2;\
1038             pixels+=line_size;\
1039             block +=line_size;\
1040         }\
1041 }\
1042 \
1043 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1044 {\
1045         int i;\
1046         const uint32_t a= AV_RN32(pixels  );\
1047         const uint32_t b= AV_RN32(pixels+1);\
1048         uint32_t l0=  (a&0x03030303UL)\
1049                     + (b&0x03030303UL)\
1050                     + 0x02020202UL;\
1051         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1052                    + ((b&0xFCFCFCFCUL)>>2);\
1053         uint32_t l1,h1;\
1054 \
1055         pixels+=line_size;\
1056         for(i=0; i<h; i+=2){\
1057             uint32_t a= AV_RN32(pixels  );\
1058             uint32_t b= AV_RN32(pixels+1);\
1059             l1=  (a&0x03030303UL)\
1060                + (b&0x03030303UL);\
1061             h1= ((a&0xFCFCFCFCUL)>>2)\
1062               + ((b&0xFCFCFCFCUL)>>2);\
1063             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1064             pixels+=line_size;\
1065             block +=line_size;\
1066             a= AV_RN32(pixels  );\
1067             b= AV_RN32(pixels+1);\
1068             l0=  (a&0x03030303UL)\
1069                + (b&0x03030303UL)\
1070                + 0x02020202UL;\
1071             h0= ((a&0xFCFCFCFCUL)>>2)\
1072               + ((b&0xFCFCFCFCUL)>>2);\
1073             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1074             pixels+=line_size;\
1075             block +=line_size;\
1076         }\
1077 }\
1078 \
1079 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1080 {\
1081     int j;\
1082     for(j=0; j<2; j++){\
1083         int i;\
1084         const uint32_t a= AV_RN32(pixels  );\
1085         const uint32_t b= AV_RN32(pixels+1);\
1086         uint32_t l0=  (a&0x03030303UL)\
1087                     + (b&0x03030303UL)\
1088                     + 0x02020202UL;\
1089         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1090                    + ((b&0xFCFCFCFCUL)>>2);\
1091         uint32_t l1,h1;\
1092 \
1093         pixels+=line_size;\
1094         for(i=0; i<h; i+=2){\
1095             uint32_t a= AV_RN32(pixels  );\
1096             uint32_t b= AV_RN32(pixels+1);\
1097             l1=  (a&0x03030303UL)\
1098                + (b&0x03030303UL);\
1099             h1= ((a&0xFCFCFCFCUL)>>2)\
1100               + ((b&0xFCFCFCFCUL)>>2);\
1101             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1102             pixels+=line_size;\
1103             block +=line_size;\
1104             a= AV_RN32(pixels  );\
1105             b= AV_RN32(pixels+1);\
1106             l0=  (a&0x03030303UL)\
1107                + (b&0x03030303UL)\
1108                + 0x02020202UL;\
1109             h0= ((a&0xFCFCFCFCUL)>>2)\
1110               + ((b&0xFCFCFCFCUL)>>2);\
1111             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1112             pixels+=line_size;\
1113             block +=line_size;\
1114         }\
1115         pixels+=4-line_size*(h+1);\
1116         block +=4-line_size*h;\
1117     }\
1118 }\
1119 \
1120 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1121 {\
1122     int j;\
1123     for(j=0; j<2; j++){\
1124         int i;\
1125         const uint32_t a= AV_RN32(pixels  );\
1126         const uint32_t b= AV_RN32(pixels+1);\
1127         uint32_t l0=  (a&0x03030303UL)\
1128                     + (b&0x03030303UL)\
1129                     + 0x01010101UL;\
1130         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1131                    + ((b&0xFCFCFCFCUL)>>2);\
1132         uint32_t l1,h1;\
1133 \
1134         pixels+=line_size;\
1135         for(i=0; i<h; i+=2){\
1136             uint32_t a= AV_RN32(pixels  );\
1137             uint32_t b= AV_RN32(pixels+1);\
1138             l1=  (a&0x03030303UL)\
1139                + (b&0x03030303UL);\
1140             h1= ((a&0xFCFCFCFCUL)>>2)\
1141               + ((b&0xFCFCFCFCUL)>>2);\
1142             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1143             pixels+=line_size;\
1144             block +=line_size;\
1145             a= AV_RN32(pixels  );\
1146             b= AV_RN32(pixels+1);\
1147             l0=  (a&0x03030303UL)\
1148                + (b&0x03030303UL)\
1149                + 0x01010101UL;\
1150             h0= ((a&0xFCFCFCFCUL)>>2)\
1151               + ((b&0xFCFCFCFCUL)>>2);\
1152             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1153             pixels+=line_size;\
1154             block +=line_size;\
1155         }\
1156         pixels+=4-line_size*(h+1);\
1157         block +=4-line_size*h;\
1158     }\
1159 }\
1160 \
1161 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1162 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1163 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1164 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1165 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1166 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1168 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1169
1170 #define op_avg(a, b) a = rnd_avg32(a, b)
1171 #endif
 /* Store operator handed to PIXOP2: OP(a, b) simply overwrites a with b. */
 1172 #define op_put(a, b) a = b
 1173
 /* Expand PIXOP2 once per operator: the avg_* functions write through op_avg,
  * the put_* functions write through op_put. */
 1174 PIXOP2(avg, op_avg)
 1175 PIXOP2(put, op_put)
 /* Drop the operator macros again; later code redefines them with other bodies. */
 1176 #undef op_avg
 1177 #undef op_put
1178
/* The whole-pixel "no_rnd" put functions are plain aliases of the regular
 * put_pixels copies. */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

/* Integer mean of two / four values with the half added before the shift.
 * Every argument is parenthesized so that expressions containing operators
 * of lower precedence than '+' (e.g. x|y, x<<1, c ? a : b) can be passed
 * safely; each argument is still evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1184
/**
 * Single-stride entry point: forwards to put_no_rnd_pixels16_l2() with the
 * same stride used for the destination and for both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* dst stride, stride of a, stride of b -- all identical here */
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1188
/**
 * Single-stride entry point: forwards to put_no_rnd_pixels8_l2() with the
 * same stride used for the destination and for both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* dst stride, stride of a, stride of b -- all identical here */
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1192
/**
 * Weighted 2x2 blend of src into an 8-pixel-wide block of h rows.
 *
 * Each output pixel mixes the source pixel at the same position with its
 * right, lower and lower-right neighbours. The four weights are derived
 * from x16 and y16 and always add up to 256; rounder is added before the
 * final shift by 8.
 *
 * @param dst     destination, advanced by stride per row
 * @param src     source, advanced by stride per row; 9 columns and h+1 rows are read
 * @param stride  distance in bytes between rows of both buffers
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder constant added to every weighted sum before >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_here       = (16 - x16) * (16 - y16);
    const int w_right      = x16 * (16 - y16);
    const int w_below      = (16 - x16) * y16;
    const int w_belowright = x16 * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *next = src + stride; /* source row directly underneath */

        for (col = 0; col < 8; col++) {
            dst[col] = (w_here * src[col] + w_right * src[col + 1]
                        + w_below * next[col] + w_belowright * next[col + 1]
                        + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1215
/**
 * Fill an 8-pixel-wide block of h rows in dst by sampling src along an
 * affine trajectory.
 *
 * The sampling position starts at (ox, oy) for the first row, moves by
 * (dxx, dyx) for every output column and by (dxy, dyy) for every output
 * row. After dropping the low 16 bits, the lowest `shift` bits of a
 * position are its fraction and the remaining bits the source coordinate.
 *
 * Depending on whether the x and/or y coordinate passes the unsigned range
 * check against the (decremented) width/height, a pixel is produced by a
 * 2x2 blend, a horizontal-only blend, a vertical-only blend, or a plain copy;
 * coordinates failing the check are clamped with av_clip().
 *
 * @param r  constant added to the weighted sum before the >>(2*shift)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one = 1 << shift;   /* weight corresponding to 1.0 */
    int row;

    /* both limits are decremented once before being used in the checks below */
    width--;
    height--;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + row * stride;
        int px = ox;
        int py = oy;
        int col;

        for (col = 0; col < 8; col++, px += dxx, py += dyx) {
            int sx = px >> 16;
            int sy = py >> 16;
            const int fx = sx & (one - 1);
            const int fy = sy & (one - 1);
            int idx;

            sx >>= shift;
            sy >>= shift;

            if ((unsigned)sx < width && (unsigned)sy < height) {
                /* both coordinates pass: blend the full 2x2 neighbourhood */
                idx = sx + sy * stride;
                out[col] = (  (  src[idx             ] * (one - fx)
                               + src[idx          + 1] * fx) * (one - fy)
                            + (  src[idx + stride    ] * (one - fx)
                               + src[idx + stride + 1] * fx) * fy
                            + r) >> (shift * 2);
            } else if ((unsigned)sx < width) {
                /* only x passes: clamp the row, blend horizontally */
                idx = sx + av_clip(sy, 0, height) * stride;
                out[col] = (  (  src[idx    ] * (one - fx)
                               + src[idx + 1] * fx) * one
                            + r) >> (shift * 2);
            } else if ((unsigned)sy < height) {
                /* only y passes: clamp the column, blend vertically */
                idx = av_clip(sx, 0, width) + sy * stride;
                out[col] = (  (  src[idx         ] * (one - fy)
                               + src[idx + stride] * fy) * one
                            + r) >> (shift * 2);
            } else {
                /* neither passes: copy the clamped source pixel */
                idx = av_clip(sx, 0, width) + av_clip(sy, 0, height) * stride;
                out[col] = src[idx];
            }
        }
    }
}
1273
/**
 * Whole-pixel case of the tpel "put" family: hands the block to the
 * put_pixelsN_c copy matching the requested width (2, 4, 8 or 16).
 * Any other width does nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1282
/**
 * tpel "put", horizontal blend: weights 2:1 on (pixel, right neighbour).
 * The sum (plus 1) is scaled by 683/2048, i.e. roughly divided by 3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1293
/**
 * tpel "put", horizontal blend: weights 1:2 on (pixel, right neighbour).
 * The sum (plus 1) is scaled by 683/2048, i.e. roughly divided by 3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1304
/**
 * tpel "put", vertical blend: weights 2:1 on (pixel, pixel one row below).
 * The sum (plus 1) is scaled by 683/2048, i.e. roughly divided by 3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + stride] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1315
/**
 * tpel "put", 2x2 blend with weights 4 (pixel), 3 (right), 3 (below),
 * 2 (below-right). The sum (plus 6) is scaled by 2731/32768, i.e. roughly
 * divided by 12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col] + 3 * src[col + 1]
                          + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1326
/**
 * tpel "put", 2x2 blend with weights 3 (pixel), 2 (right), 4 (below),
 * 3 (below-right). The sum (plus 6) is scaled by 2731/32768, i.e. roughly
 * divided by 12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col] + 2 * src[col + 1]
                          + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1337
/**
 * tpel "put", vertical blend: weights 1:2 on (pixel, pixel one row below).
 * The sum (plus 1) is scaled by 683/2048, i.e. roughly divided by 3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + stride] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1348
/**
 * tpel "put", 2x2 blend with weights 3 (pixel), 4 (right), 2 (below),
 * 3 (below-right). The sum (plus 6) is scaled by 2731/32768, i.e. roughly
 * divided by 12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col] + 4 * src[col + 1]
                          + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1359
/**
 * tpel "put", 2x2 blend with weights 2 (pixel), 3 (right), 3 (below),
 * 4 (below-right). The sum (plus 6) is scaled by 2731/32768, i.e. roughly
 * divided by 12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + 3 * src[col + 1]
                          + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1370
/**
 * Whole-pixel case of the tpel "avg" family: hands the block to the
 * avg_pixelsN_c function matching the requested width (2, 4, 8 or 16).
 * Any other width does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1379
/**
 * tpel "avg", horizontal blend: the 2:1 mix of (pixel, right neighbour),
 * scaled by 683/2048, is averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1390
/**
 * tpel "avg", horizontal blend: the 1:2 mix of (pixel, right neighbour),
 * scaled by 683/2048, is averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1401
/**
 * tpel "avg", vertical blend: the 2:1 mix of (pixel, pixel one row below),
 * scaled by 683/2048, is averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1412
/**
 * tpel "avg", 2x2 blend with weights 4 (pixel), 3 (right), 3 (below),
 * 2 (below-right), scaled by 2731/32768; the result is averaged (rounding
 * up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731 * (4 * src[col] + 3 * src[col + 1]
                                      + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1423
/**
 * tpel "avg", 2x2 blend with weights 3 (pixel), 2 (right), 4 (below),
 * 3 (below-right), scaled by 2731/32768; the result is averaged (rounding
 * up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731 * (3 * src[col] + 2 * src[col + 1]
                                      + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1434
/**
 * tpel "avg", vertical blend: the 1:2 mix of (pixel, pixel one row below),
 * scaled by 683/2048, is averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1445
/**
 * tpel "avg", 2x2 blend with weights 3 (pixel), 4 (right), 2 (below),
 * 3 (below-right), scaled by 2731/32768; the result is averaged (rounding
 * up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731 * (3 * src[col] + 4 * src[col + 1]
                                      + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1456
/**
 * tpel "avg", 2x2 blend with weights 2 (pixel), 3 (right), 3 (below),
 * 4 (below-right), scaled by 2731/32768; the result is averaged (rounding
 * up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731 * (2 * src[col] + 3 * src[col + 1]
                                      + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator: for a given width it would emit fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c that forward to the generic
 * put_tpel_pixels_mcXY_c helpers with the width filled in.
 * The wrapper bodies are real calls (the former leading "void" turned each
 * of them into an invalid declaration), so the macro compiles if the guard
 * is ever lifted. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1488
/*
 * Generator for one chroma interpolation function of block width W.
 *
 * The generated OPNAME##h264_chroma_mc<W>_c blends every pixel with its
 * right, lower and lower-right neighbours using the weights
 *   A=(8-x)(8-y)  B=x(8-y)  C=(8-x)y  D=xy      (A+B+C+D == 64)
 * and hands the unscaled weighted sum to OP, which performs the final
 * scaling and the store into dst.
 *
 * When D is zero at most one of B and C is non-zero, so a two-tap
 * filter along that single direction is used instead: E is the weight of the
 * second tap and step its offset (one row down if C is set, otherwise one
 * pixel to the right).
 */
#define H264_CHROMA_MC_FN(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            }\
        }\
    }\
}

/* Emits the 2-, 4- and 8-pixel-wide variants for one store operator. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FN(OPNAME, OP, 2)\
H264_CHROMA_MC_FN(OPNAME, OP, 4)\
H264_CHROMA_MC_FN(OPNAME, OP, 8)

/* Both operators scale the weighted sum b by (b+32)>>6; op_avg additionally
 * averages the result (rounding up) with the value already stored in a. */
#define op_put(a, b) a = (((b) + 32)>>6)
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1597
/**
 * 8-pixel-wide chroma interpolation for VC-1, "no rounding" flavour.
 *
 * Every output pixel is the weighted sum of the source pixel and its right,
 * lower and lower-right neighbours (weights A..D, which add up to 64),
 * biased by 32 - 4 and shifted right by 6.
 *
 * @param dst    destination, advanced by stride per row
 * @param src    source, advanced by stride per row; 9 columns and h+1 rows are read
 * @param stride distance in bytes between rows of both buffers
 * @param h      number of rows to produce
 * @param x      horizontal fraction, asserted to lie in 0..7
 * @param y      vertical fraction, asserted to lie in 0..7
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *next = src + stride; /* source row directly underneath */

        for (col = 0; col < 8; col++) {
            dst[col] = (A*src[col] + B*src[col+1] + C*next[col] + D*next[col+1] + 32 - 4) >> 6;
        }
    }
}
1621
/**
 * 8-pixel-wide chroma interpolation for VC-1, "no rounding" flavour,
 * averaged into the destination.
 *
 * The interpolated value is the weighted sum of the source pixel and its
 * right, lower and lower-right neighbours (weights A..D, which add up to
 * 64), biased by 32 - 4 and shifted right by 6. It is then combined with
 * the value already in dst through avg2().
 *
 * @param dst    destination, read and rewritten; advanced by stride per row
 * @param src    source, advanced by stride per row; 9 columns and h+1 rows are read
 * @param stride distance in bytes between rows of both buffers
 * @param h      number of rows to produce
 * @param x      horizontal fraction, asserted to lie in 0..7
 * @param y      vertical fraction, asserted to lie in 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *next = src + stride; /* source row directly underneath */

        for (col = 0; col < 8; col++) {
            dst[col] = avg2(dst[col], ((A*src[col] + B*src[col+1] + C*next[col] + D*next[col+1] + 32 - 4) >> 6));
        }
    }
}
1645
/**
 * Generate one family of C quarter-pel motion compensation functions
 * (8x8 and 16x16) for a given store mode.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME identifier prefix pasted onto every generated function name
 *               and onto the pixelsN_l2 / pixelsN_l4 helpers that perform the
 *               final store into dst
 * @param RND    infix pasted into "put ## RND ## ..." to pick the helper
 *               variants that build the intermediate planes (half, halfH,
 *               halfV, halfHV); intermediates are always produced with a
 *               put-type helper, only the last step uses OPNAME
 * @param OP     store macro invoked as OP(dst_pixel, raw_filter_sum)
 *
 * The *_lowpass helpers evaluate the symmetric 8-tap kernel
 * (-1, 3, -6, 20, 20, -6, 3, -1) centred between two neighbouring samples.
 * Near the block border, taps that would land outside src[0..N] are folded
 * back onto samples inside that range, so an 8-wide (16-wide) line reads
 * exactly 9 (17) input samples. The helpers hand the raw sum to OP; cm is
 * declared in each helper but only the OP macros reference it.
 *
 * The qpelN_mcXY_c entry points blend the source block with horizontally,
 * vertically and/or both-ways filtered planes. X and Y appear to be the
 * horizontal and vertical quarter-sample offsets (0..3) -- inferred from
 * which planes each function combines; confirm against the callers.
 * The ff_*_old_c variants are the only non-static functions generated here
 * and compute an additional halfV plane.
 */
1646 #define QPEL_MC(r, OPNAME, RND, OP) \
/* 8-wide horizontal filter: h rows, reads src[0..8] of each row */\
1647 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1648     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1649     int i;\
1650     for(i=0; i<h; i++)\
1651     {\
1652         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1653         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1654         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1655         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1656         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1657         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1658         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1659         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1660         dst+=dstStride;\
1661         src+=srcStride;\
1662     }\
1663 }\
1664 \
/* 8-wide vertical filter: 8 columns, reads rows 0..8 of each column */\
1665 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1666     const int w=8;\
1667     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1668     int i;\
1669     for(i=0; i<w; i++)\
1670     {\
1671         const int src0= src[0*srcStride];\
1672         const int src1= src[1*srcStride];\
1673         const int src2= src[2*srcStride];\
1674         const int src3= src[3*srcStride];\
1675         const int src4= src[4*srcStride];\
1676         const int src5= src[5*srcStride];\
1677         const int src6= src[6*srcStride];\
1678         const int src7= src[7*srcStride];\
1679         const int src8= src[8*srcStride];\
1680         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1681         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1682         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1683         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1684         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1685         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1686         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1687         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1688         dst++;\
1689         src++;\
1690     }\
1691 }\
1692 \
/* 16-wide horizontal filter: h rows, reads src[0..16] of each row */\
1693 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1694     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1695     int i;\
1696     \
1697     for(i=0; i<h; i++)\
1698     {\
1699         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1700         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1701         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1702         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1703         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1704         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1705         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1706         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1707         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1708         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1709         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1710         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1711         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1712         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1713         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1714         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1715         dst+=dstStride;\
1716         src+=srcStride;\
1717     }\
1718 }\
1719 \
/* 16-wide vertical filter: 16 columns, reads rows 0..16 of each column */\
1720 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1721     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1722     int i;\
1723     const int w=16;\
1724     for(i=0; i<w; i++)\
1725     {\
1726         const int src0= src[0*srcStride];\
1727         const int src1= src[1*srcStride];\
1728         const int src2= src[2*srcStride];\
1729         const int src3= src[3*srcStride];\
1730         const int src4= src[4*srcStride];\
1731         const int src5= src[5*srcStride];\
1732         const int src6= src[6*srcStride];\
1733         const int src7= src[7*srcStride];\
1734         const int src8= src[8*srcStride];\
1735         const int src9= src[9*srcStride];\
1736         const int src10= src[10*srcStride];\
1737         const int src11= src[11*srcStride];\
1738         const int src12= src[12*srcStride];\
1739         const int src13= src[13*srcStride];\
1740         const int src14= src[14*srcStride];\
1741         const int src15= src[15*srcStride];\
1742         const int src16= src[16*srcStride];\
1743         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1744         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1745         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1746         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1747         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1748         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1749         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1750         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1751         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1752         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1753         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1754         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1755         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1756         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1757         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1758         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1759         dst++;\
1760         src++;\
1761     }\
1762 }\
1763 \
/* 8x8 entry points. full[] is filled by copy_block9 with a row stride of 16; halfH receives 9 rows of 8 filtered samples. */\
1764 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1765     uint8_t half[64];\
1766     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1767     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1768 }\
1769 \
1770 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1771     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1772 }\
1773 \
1774 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1775     uint8_t half[64];\
1776     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1777     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1778 }\
1779 \
1780 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1781     uint8_t full[16*9];\
1782     uint8_t half[64];\
1783     copy_block9(full, src, 16, stride, 9);\
1784     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1785     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1786 }\
1787 \
1788 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1789     uint8_t full[16*9];\
1790     copy_block9(full, src, 16, stride, 9);\
1791     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1792 }\
1793 \
1794 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1795     uint8_t full[16*9];\
1796     uint8_t half[64];\
1797     copy_block9(full, src, 16, stride, 9);\
1798     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1799     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1800 }\
1801 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802     uint8_t full[16*9];\
1803     uint8_t halfH[72];\
1804     uint8_t halfV[64];\
1805     uint8_t halfHV[64];\
1806     copy_block9(full, src, 16, stride, 9);\
1807     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1809     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1811 }\
1812 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1813     uint8_t full[16*9];\
1814     uint8_t halfH[72];\
1815     uint8_t halfHV[64];\
1816     copy_block9(full, src, 16, stride, 9);\
1817     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1819     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1821 }\
1822 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1823     uint8_t full[16*9];\
1824     uint8_t halfH[72];\
1825     uint8_t halfV[64];\
1826     uint8_t halfHV[64];\
1827     copy_block9(full, src, 16, stride, 9);\
1828     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1830     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1831     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1832 }\
1833 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1834     uint8_t full[16*9];\
1835     uint8_t halfH[72];\
1836     uint8_t halfHV[64];\
1837     copy_block9(full, src, 16, stride, 9);\
1838     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1840     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1842 }\
1843 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844     uint8_t full[16*9];\
1845     uint8_t halfH[72];\
1846     uint8_t halfV[64];\
1847     uint8_t halfHV[64];\
1848     copy_block9(full, src, 16, stride, 9);\
1849     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1851     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1853 }\
1854 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1855     uint8_t full[16*9];\
1856     uint8_t halfH[72];\
1857     uint8_t halfHV[64];\
1858     copy_block9(full, src, 16, stride, 9);\
1859     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1861     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1862     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1863 }\
1864 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865     uint8_t full[16*9];\
1866     uint8_t halfH[72];\
1867     uint8_t halfV[64];\
1868     uint8_t halfHV[64];\
1869     copy_block9(full, src, 16, stride, 9);\
1870     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1871     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1872     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1874 }\
1875 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1876     uint8_t full[16*9];\
1877     uint8_t halfH[72];\
1878     uint8_t halfHV[64];\
1879     copy_block9(full, src, 16, stride, 9);\
1880     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1882     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1884 }\
1885 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1886     uint8_t halfH[72];\
1887     uint8_t halfHV[64];\
1888     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1889     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1891 }\
1892 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t halfH[72];\
1894     uint8_t halfHV[64];\
1895     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1896     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1897     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1898 }\
1899 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900     uint8_t full[16*9];\
1901     uint8_t halfH[72];\
1902     uint8_t halfV[64];\
1903     uint8_t halfHV[64];\
1904     copy_block9(full, src, 16, stride, 9);\
1905     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1907     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1909 }\
1910 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1911     uint8_t full[16*9];\
1912     uint8_t halfH[72];\
1913     copy_block9(full, src, 16, stride, 9);\
1914     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1915     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1916     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1917 }\
1918 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919     uint8_t full[16*9];\
1920     uint8_t halfH[72];\
1921     uint8_t halfV[64];\
1922     uint8_t halfHV[64];\
1923     copy_block9(full, src, 16, stride, 9);\
1924     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1925     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1926     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1927     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1928 }\
1929 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1930     uint8_t full[16*9];\
1931     uint8_t halfH[72];\
1932     copy_block9(full, src, 16, stride, 9);\
1933     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1934     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1935     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1936 }\
1937 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t halfH[72];\
1939     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1940     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1941 }\
1942 \
/* 16x16 entry points. full[] is filled by copy_block17 with a row stride of 24; halfH receives 17 rows of 16 filtered samples. */\
1943 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1944     uint8_t half[256];\
1945     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1946     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1947 }\
1948 \
1949 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1950     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1951 }\
1952 \
1953 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1954     uint8_t half[256];\
1955     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1956     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1957 }\
1958 \
1959 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1960     uint8_t full[24*17];\
1961     uint8_t half[256];\
1962     copy_block17(full, src, 24, stride, 17);\
1963     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1964     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1965 }\
1966 \
1967 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1968     uint8_t full[24*17];\
1969     copy_block17(full, src, 24, stride, 17);\
1970     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1971 }\
1972 \
1973 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1974     uint8_t full[24*17];\
1975     uint8_t half[256];\
1976     copy_block17(full, src, 24, stride, 17);\
1977     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1978     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1979 }\
1980 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981     uint8_t full[24*17];\
1982     uint8_t halfH[272];\
1983     uint8_t halfV[256];\
1984     uint8_t halfHV[256];\
1985     copy_block17(full, src, 24, stride, 17);\
1986     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1988     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1990 }\
1991 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1992     uint8_t full[24*17];\
1993     uint8_t halfH[272];\
1994     uint8_t halfHV[256];\
1995     copy_block17(full, src, 24, stride, 17);\
1996     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1997     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1998     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2000 }\
2001 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2002     uint8_t full[24*17];\
2003     uint8_t halfH[272];\
2004     uint8_t halfV[256];\
2005     uint8_t halfHV[256];\
2006     copy_block17(full, src, 24, stride, 17);\
2007     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2008     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2009     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2010     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2011 }\
2012 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2013     uint8_t full[24*17];\
2014     uint8_t halfH[272];\
2015     uint8_t halfHV[256];\
2016     copy_block17(full, src, 24, stride, 17);\
2017     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2018     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2019     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2020     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2021 }\
2022 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023     uint8_t full[24*17];\
2024     uint8_t halfH[272];\
2025     uint8_t halfV[256];\
2026     uint8_t halfHV[256];\
2027     copy_block17(full, src, 24, stride, 17);\
2028     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2030     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2032 }\
2033 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2034     uint8_t full[24*17];\
2035     uint8_t halfH[272];\
2036     uint8_t halfHV[256];\
2037     copy_block17(full, src, 24, stride, 17);\
2038     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2039     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2040     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2041     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2042 }\
2043 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2044     uint8_t full[24*17];\
2045     uint8_t halfH[272];\
2046     uint8_t halfV[256];\
2047     uint8_t halfHV[256];\
2048     copy_block17(full, src, 24, stride, 17);\
2049     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2050     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2051     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2052     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2053 }\
2054 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2055     uint8_t full[24*17];\
2056     uint8_t halfH[272];\
2057     uint8_t halfHV[256];\
2058     copy_block17(full, src, 24, stride, 17);\
2059     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2060     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2061     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2062     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2063 }\
2064 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2065     uint8_t halfH[272];\
2066     uint8_t halfHV[256];\
2067     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2068     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2069     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2070 }\
2071 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2072     uint8_t halfH[272];\
2073     uint8_t halfHV[256];\
2074     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2075     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2077 }\
2078 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2079     uint8_t full[24*17];\
2080     uint8_t halfH[272];\
2081     uint8_t halfV[256];\
2082     uint8_t halfHV[256];\
2083     copy_block17(full, src, 24, stride, 17);\
2084     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2085     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2086     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2087     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2088 }\
2089 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2090     uint8_t full[24*17];\
2091     uint8_t halfH[272];\
2092     copy_block17(full, src, 24, stride, 17);\
2093     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2094     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2095     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2096 }\
2097 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2098     uint8_t full[24*17];\
2099     uint8_t halfH[272];\
2100     uint8_t halfV[256];\
2101     uint8_t halfHV[256];\
2102     copy_block17(full, src, 24, stride, 17);\
2103     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2104     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2105     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2106     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2107 }\
2108 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2109     uint8_t full[24*17];\
2110     uint8_t halfH[272];\
2111     copy_block17(full, src, 24, stride, 17);\
2112     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2113     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2114     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2115 }\
2116 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2117     uint8_t halfH[272];\
2118     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2119     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2120 }
2121
/*
 * Store operators handed to QPEL_MC as its OP argument. "a" is the
 * destination pixel lvalue, "b" the raw filter sum; "cm" is the table pointer
 * declared locally in every generated lowpass function.
 *  - op_put / op_avg add 16 before the >>5, the _no_rnd forms add 15.
 *  - the avg forms additionally average the table result with the value
 *    already stored in "a" (op_avg adds 1 before halving, op_avg_no_rnd
 *    does not).
 */
2122 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2123 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2124 #define op_put(a, b) a = cm[((b) + 16)>>5]
2125 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2126
/*
 * Instantiate the put_, put_no_rnd_ and avg_ function families. The
 * avg_no_rnd instantiation is left commented out, so op_avg_no_rnd is not
 * used by any active expansion here.
 */
2127 QPEL_MC(0, put_       , _       , op_put)
2128 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2129 QPEL_MC(0, avg_       , _       , op_avg)
2130 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The store operators are only needed for the expansions above. */
2131 #undef op_avg
2132 #undef op_avg_no_rnd
2133 #undef op_put
2134 #undef op_put_no_rnd
2135
/*
 * QPEL_MC generates no mc00 functions; the mc00 names are plain aliases for
 * the 8x8 / 16x16 pixel put and avg functions. Note that the put_no_rnd_
 * aliases resolve to the same ff_put_pixels functions as the put_ ones.
 */
2136 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2137 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2138 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2139 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2140 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2141 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2142
2143 #if 1
2144 #define H264_LOWPASS(OPNAME, OP, OP2) \
2145 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2146     const int h=2;\
2147     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2148     int i;\
2149     for(i=0; i<h; i++)\
2150     {\
2151         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2152         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2153         dst+=dstStride;\
2154         src+=srcStride;\
2155     }\
2156 }\
2157 \
2158 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2159     const int w=2;\
2160     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2161     int i;\
2162     for(i=0; i<w; i++)\
2163     {\
2164         const int srcB= src[-2*srcStride];\
2165         const int srcA= src[-1*srcStride];\
2166         const int src0= src[0 *srcStride];\
2167         const int src1= src[1 *srcStride];\
2168         const int src2= src[2 *srcStride];\
2169         const int src3= src[3 *srcStride];\
2170         const int src4= src[4 *srcStride];\
2171         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2172         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2173         dst++;\
2174         src++;\
2175     }\
2176 }\
2177 \
2178 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2179     const int h=2;\
2180     const int w=2;\
2181     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182     int i;\
2183     src -= 2*srcStride;\
2184     for(i=0; i<h+5; i++)\
2185     {\
2186         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2187         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2188         tmp+=tmpStride;\
2189         src+=srcStride;\
2190     }\
2191     tmp -= tmpStride*(h+5-2);\
2192     for(i=0; i<w; i++)\
2193     {\
2194         const int tmpB= tmp[-2*tmpStride];\
2195         const int tmpA= tmp[-1*tmpStride];\
2196         const int tmp0= tmp[0 *tmpStride];\
2197         const int tmp1= tmp[1 *tmpStride];\
2198         const int tmp2= tmp[2 *tmpStride];\
2199         const int tmp3= tmp[3 *tmpStride];\
2200         const int tmp4= tmp[4 *tmpStride];\
2201         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2202         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2203         dst++;\
2204         tmp++;\
2205     }\
2206 }\
2207 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2208     const int h=4;\
2209     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2210     int i;\
2211     for(i=0; i<h; i++)\
2212     {\
2213         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2214         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2215         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2216         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2217         dst+=dstStride;\
2218         src+=srcStride;\
2219     }\
2220 }\
2221 \
2222 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2223     const int w=4;\
2224     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2225     int i;\
2226     for(i=0; i<w; i++)\
2227     {\
2228         const int srcB= src[-2*srcStride];\
2229         const int srcA= src[-1*srcStride];\
2230         const int src0= src[0 *srcStride];\
2231         const int src1= src[1 *srcStride];\
2232         const int src2= src[2 *srcStride];\
2233         const int src3= src[3 *srcStride];\
2234         const int src4= src[4 *srcStride];\
2235         const int src5= src[5 *srcStride];\
2236         const int src6= src[6 *srcStride];\
2237         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2238         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2239         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2240         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2241         dst++;\
2242         src++;\
2243     }\
2244 }\
2245 \
2246 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2247     const int h=4;\
2248     const int w=4;\
2249     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2250     int i;\
2251     src -= 2*srcStride;\
2252     for(i=0; i<h+5; i++)\
2253     {\
2254         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2255         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2256         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2257         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2258         tmp+=tmpStride;\
2259         src+=srcStride;\
2260     }\
2261     tmp -= tmpStride*(h+5-2);\
2262     for(i=0; i<w; i++)\
2263     {\
2264         const int tmpB= tmp[-2*tmpStride];\
2265         const int tmpA= tmp[-1*tmpStride];\
2266         const int tmp0= tmp[0 *tmpStride];\
2267         const int tmp1= tmp[1 *tmpStride];\
2268         const int tmp2= tmp[2 *tmpStride];\
2269         const int tmp3= tmp[3 *tmpStride];\
2270         const int tmp4= tmp[4 *tmpStride];\
2271         const int tmp5= tmp[5 *tmpStride];\
2272         const int tmp6= tmp[6 *tmpStride];\
2273         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2274         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2275         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2276         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2277         dst++;\
2278         tmp++;\
2279     }\
2280 }\
2281 \
2282 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283     const int h=8;\
2284     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2285     int i;\
2286     for(i=0; i<h; i++)\
2287     {\
2288         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2289         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2290         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2291         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2292         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2293         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2294         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2295         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2296         dst+=dstStride;\
2297         src+=srcStride;\
2298     }\
2299 }\
2300 \
2301 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2302     const int w=8;\
2303     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2304     int i;\
2305     for(i=0; i<w; i++)\
2306     {\
2307         const int srcB= src[-2*srcStride];\
2308         const int srcA= src[-1*srcStride];\
2309         const int src0= src[0 *srcStride];\
2310         const int src1= src[1 *srcStride];\
2311         const int src2= src[2 *srcStride];\
2312         const int src3= src[3 *srcStride];\
2313         const int src4= src[4 *srcStride];\
2314         const int src5= src[5 *srcStride];\
2315         const int src6= src[6 *srcStride];\
2316         const int src7= src[7 *srcStride];\
2317         const int src8= src[8 *srcStride];\
2318         const int src9= src[9 *srcStride];\
2319         const int src10=src[10*srcStride];\
2320         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2321         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2322         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2323         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2324         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2325         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2326         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2327         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2328         dst++;\
2329         src++;\
2330     }\
2331 }\
2332 \
2333 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2334     const int h=8;\
2335     const int w=8;\
2336     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2337     int i;\
2338     src -= 2*srcStride;\
2339     for(i=0; i<h+5; i++)\
2340     {\
2341         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2342         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2343         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2344         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2345         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2346         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2347         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2348         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2349         tmp+=tmpStride;\
2350         src+=srcStride;\
2351     }\
2352     tmp -= tmpStride*(h+5-2);\
2353     for(i=0; i<w; i++)\
2354     {\
2355         const int tmpB= tmp[-2*tmpStride];\
2356         const int tmpA= tmp[-1*tmpStride];\
2357         const int tmp0= tmp[0 *tmpStride];\
2358         const int tmp1= tmp[1 *tmpStride];\
2359         const int tmp2= tmp[2 *tmpStride];\
2360         const int tmp3= tmp[3 *tmpStride];\
2361         const int tmp4= tmp[4 *tmpStride];\
2362         const int tmp5= tmp[5 *tmpStride];\
2363         const int tmp6= tmp[6 *tmpStride];\
2364         const int tmp7= tmp[7 *tmpStride];\
2365         const int tmp8= tmp[8 *tmpStride];\
2366         const int tmp9= tmp[9 *tmpStride];\
2367         const int tmp10=tmp[10*tmpStride];\
2368         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2369         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2370         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2371         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2372         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2373         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2374         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2375         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2376         dst++;\
2377         tmp++;\
2378     }\
2379 }\
2380 \
2381 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2382     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2383     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2384     src += 8*srcStride;\
2385     dst += 8*dstStride;\
2386     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2387     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2388 }\
2389 \
2390 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2391     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2392     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2393     src += 8*srcStride;\
2394     dst += 8*dstStride;\
2395     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2396     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2397 }\
2398 \
2399 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2400     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2401     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2402     src += 8*srcStride;\
2403     dst += 8*dstStride;\
2404     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2405     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2406 }\
2407
/**
 * Instantiate the sixteen entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c (X, Y in 0..3) for one block size.
 *
 * Intermediate planes are always produced with the put_ lowpass filters;
 * OPNAME is only applied by the last call of each function, which either
 * writes a lowpass result straight to dst or merges two SIZE-wide planes
 * through OPNAME##pixels##SIZE##_l2.
 *
 * The vertical filter is run on a private copy of the source that starts
 * two rows above src and spans SIZE+5 rows; the pointer handed to the filter
 * is advanced by two rows into that copy.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: forwards to the plain OPNAME pixels routine */\
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: horizontal lowpass merged with the source block */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpass, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal lowpass written directly */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: horizontal lowpass merged with the source block shifted one pixel right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpass, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: vertical lowpass merged with the copied source block */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t vpass[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, vpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical lowpass written directly */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
\
    copy_block ## SIZE(rows, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
/* mc03: vertical lowpass merged with the copied source block one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t vpass[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, vpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: horizontal lowpass at src merged with vertical lowpass at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hpass[SIZE*SIZE];\
    uint8_t vpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src, SIZE, stride);\
    copy_block ## SIZE(rows, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpass, vpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: horizontal lowpass at src merged with vertical lowpass one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hpass[SIZE*SIZE];\
    uint8_t vpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src, SIZE, stride);\
    copy_block ## SIZE(rows, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpass, vpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: horizontal lowpass one row down merged with vertical lowpass at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hpass[SIZE*SIZE];\
    uint8_t vpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src + stride, SIZE, stride);\
    copy_block ## SIZE(rows, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpass, vpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: horizontal lowpass one row down merged with vertical lowpass one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hpass[SIZE*SIZE];\
    uint8_t vpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src + stride, SIZE, stride);\
    copy_block ## SIZE(rows, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpass, vpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: combined hv lowpass written directly; scratch is its 16-bit work area */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: horizontal lowpass at src merged with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hpass[SIZE*SIZE];\
    uint8_t hvpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpass, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpass, hvpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: horizontal lowpass one row down merged with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hpass[SIZE*SIZE];\
    uint8_t hvpass[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpass, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpass, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpass, hvpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: vertical lowpass at src merged with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vpass[SIZE*SIZE];\
    uint8_t hvpass[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpass, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpass, hvpass, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: vertical lowpass one column right merged with the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vpass[SIZE*SIZE];\
    uint8_t hvpass[SIZE*SIZE];\
\
    copy_block ## SIZE(rows, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpass, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpass, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpass, hvpass, stride, SIZE, SIZE, SIZE);\
}\

2544
2545 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2546 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2547 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2548 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2549 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2550
2551 H264_LOWPASS(put_       , op_put, op2_put)
2552 H264_LOWPASS(avg_       , op_avg, op2_avg)
2553 H264_MC(put_, 2)
2554 H264_MC(put_, 4)
2555 H264_MC(put_, 8)
2556 H264_MC(put_, 16)
2557 H264_MC(avg_, 4)
2558 H264_MC(avg_, 8)
2559 H264_MC(avg_, 16)
2560
2561 #undef op_avg
2562 #undef op_put
2563 #undef op2_avg
2564 #undef op2_put
2565 #endif
2566
/*
 * From here on the 8x8 and 16x16 mc00 names expand to the exported
 * ff_*_pixels*_c functions defined below.
 */
#define put_h264_qpel8_mc00_c   ff_put_pixels8x8_c
#define put_h264_qpel16_mc00_c  ff_put_pixels16x16_c
#define avg_h264_qpel8_mc00_c   ff_avg_pixels8x8_c
#define avg_h264_qpel16_mc00_c  ff_avg_pixels16x16_c
2571
2572 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2573     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2574     int i;
2575
2576     for(i=0; i<h; i++){
2577         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2578         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2579         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2580         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2581         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2582         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2583         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2584         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2585         dst+=dstStride;
2586         src+=srcStride;
2587     }
2588 }
2589
/** Fixed-size wrapper: calls put_pixels8_c with 8 as its last argument. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
/** Fixed-size wrapper: calls avg_pixels8_c with 8 as its last argument. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}
/** Fixed-size wrapper: calls put_pixels16_c with 16 as its last argument. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}
/** Fixed-size wrapper: calls avg_pixels16_c with 16 as its last argument. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
2602
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points: each forwards to the matching *_pixels*_xy2_c
 * routine, passing the block size as the last argument.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
2617
2618 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2619     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2620     int i;
2621
2622     for(i=0; i<w; i++){
2623         const int src_1= src[ -srcStride];
2624         const int src0 = src[0          ];
2625         const int src1 = src[  srcStride];
2626         const int src2 = src[2*srcStride];
2627         const int src3 = src[3*srcStride];
2628         const int src4 = src[4*srcStride];
2629         const int src5 = src[5*srcStride];
2630         const int src6 = src[6*srcStride];
2631         const int src7 = src[7*srcStride];
2632         const int src8 = src[8*srcStride];
2633         const int src9 = src[9*srcStride];
2634         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2635         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2636         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2637         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2638         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2639         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2640         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2641         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2642         src++;
2643         dst++;
2644     }
2645 }
2646
/** mc10: horizontally filtered 8x8 block merged with the source block via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2652
/** mc20: horizontally filtered 8x8 block written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2656
/** mc30: horizontally filtered 8x8 block merged with the source shifted one pixel right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2662
/** mc02: vertically filtered 8x8 block written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2666
/**
 * mc12: merges the vertical filter of the source with the vertical filter
 * of the horizontally filtered source.
 *
 * The horizontal pass covers 11 rows starting one row above src, so row 1 of
 * that buffer (offset 8) lines up with src for the second vertical pass.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mc32: same as mc12 except that the plain vertical filter is taken one
 * column to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mc22: horizontal filter over 11 rows (starting one row above src) followed
 * by the vertical filter, written straight to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2690
2691 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2692     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2693     int x;
2694     const int strength= ff_h263_loop_filter_strength[qscale];
2695
2696     for(x=0; x<8; x++){
2697         int d1, d2, ad1;
2698         int p0= src[x-2*stride];
2699         int p1= src[x-1*stride];
2700         int p2= src[x+0*stride];
2701         int p3= src[x+1*stride];
2702         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2703
2704         if     (d<-2*strength) d1= 0;
2705         else if(d<-  strength) d1=-2*strength - d;
2706         else if(d<   strength) d1= d;
2707         else if(d< 2*strength) d1= 2*strength - d;
2708         else                   d1= 0;
2709
2710         p1 += d1;
2711         p2 -= d1;
2712         if(p1&256) p1= ~(p1>>31);
2713         if(p2&256) p2= ~(p2>>31);
2714
2715         src[x-1*stride] = p1;
2716         src[x+0*stride] = p2;
2717
2718         ad1= FFABS(d1)>>1;
2719
2720         d2= av_clip((p0-p3)/4, -ad1, ad1);
2721
2722         src[x-2*stride] = p0 - d2;
2723         src[x+  stride] = p3 + d2;
2724     }
2725     }
2726 }
2727
2728 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2729     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2730     int y;
2731     const int strength= ff_h263_loop_filter_strength[qscale];
2732
2733     for(y=0; y<8; y++){
2734         int d1, d2, ad1;
2735         int p0= src[y*stride-2];
2736         int p1= src[y*stride-1];
2737         int p2= src[y*stride+0];
2738         int p3= src[y*stride+1];
2739         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2740
2741         if     (d<-2*strength) d1= 0;
2742         else if(d<-  strength) d1=-2*strength - d;
2743         else if(d<   strength) d1= d;
2744         else if(d< 2*strength) d1= 2*strength - d;
2745         else                   d1= 0;
2746
2747         p1 += d1;
2748         p2 -= d1;
2749         if(p1&256) p1= ~(p1>>31);
2750         if(p2&256) p2= ~(p2>>31);
2751
2752         src[y*stride-1] = p1;
2753         src[y*stride+0] = p2;
2754
2755         ad1= FFABS(d1)>>1;
2756
2757         d2= av_clip((p0-p3)/4, -ad1, ad1);
2758
2759         src[y*stride-2] = p0 - d2;
2760         src[y*stride+1] = p3 + d2;
2761     }
2762     }
2763 }
2764
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 builds a vertically filtered copy: rows 0 and 7 are just scaled by
 * 4, rows 1..6 get above + 2*centre + below. Pass 2 filters that copy
 * horizontally: columns 0 and 7 only undo the scale ((v + 2) >> 2), columns
 * 1..6 get (left + 2*centre + right + 8) >> 4.
 *
 * @param src    top-left pixel of the block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[8*8];
    int row, col;

    /* Pass 1: vertical filter into vert[], reading src only. */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row*stride;
        int *out = vert + row*8;

        if (row == 0 || row == 7) {
            for (col = 0; col < 8; col++)
                out[col] = 4*line[col];
        } else {
            for (col = 0; col < 8; col++)
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* Pass 2: horizontal filter from vert[] back into src. */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row*stride;
        const int *in = vert + row*8;

        line[0] = (in[0] + 2)>>2;
        line[7] = (in[7] + 2)>>2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8)>>4;
    }
}
2791
/**
 * Sum of absolute differences over a 16-pixel-wide area of h rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return accumulated |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2819
/**
 * Sum of absolute differences, 16 pixels wide, between pix1 and pix2 where
 * every pix2 sample is first combined with its right neighbour via avg2().
 * Reads pix2[0] through pix2[16] on each row.
 *
 * @param v         unused
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2847
/**
 * Sum of absolute differences, 16 pixels wide, between pix1 and pix2 where
 * every pix2 sample is first combined with the sample one row below via
 * avg2(). Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2877
/**
 * Sum of absolute differences, 16 pixels wide, between pix1 and pix2 where
 * every pix2 sample is first combined with its right, lower and lower-right
 * neighbours via avg4(). Reads 17 columns and h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2907
/**
 * Sum of absolute differences over an 8-pixel-wide area of h rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return accumulated |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2927
/**
 * Sum of absolute differences, 8 pixels wide, between pix1 and pix2 where
 * every pix2 sample is first combined with its right neighbour via avg2().
 * Reads pix2[0] through pix2[8] on each row.
 *
 * @param v         unused
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2947
/**
 * Sum of absolute differences, 8 pixels wide, between pix1 and pix2 where
 * every pix2 sample is first combined with the sample one row below via
 * avg2(). Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2969
/**
 * Sum of absolute differences, 8 pixels wide, between pix1 and pix2 where
 * every pix2 sample is first combined with its right, lower and lower-right
 * neighbours via avg4(). Reads 9 columns and h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2991
2992 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2993     MpegEncContext *c = v;
2994     int score1=0;
2995     int score2=0;
2996     int x,y;
2997
2998     for(y=0; y<h; y++){
2999         for(x=0; x<16; x++){
3000             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3001         }
3002         if(y+1<h){
3003             for(x=0; x<15; x++){
3004                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3005                              - s1[x+1] + s1[x+1+stride])
3006                         -FFABS(  s2[x  ] - s2[x  +stride]
3007                              - s2[x+1] + s2[x+1+stride]);
3008             }
3009         }
3010         s1+= stride;
3011         s2+= stride;
3012     }
3013
3014     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3015     else  return score1 + FFABS(score2)*8;
3016 }
3017
3018 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3019     MpegEncContext *c = v;
3020     int score1=0;
3021     int score2=0;
3022     int x,y;
3023
3024     for(y=0; y<h; y++){
3025         for(x=0; x<8; x++){
3026             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3027         }
3028         if(y+1<h){
3029             for(x=0; x<7; x++){
3030                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3031                              - s1[x+1] + s1[x+1+stride])
3032                         -FFABS(  s2[x  ] - s2[x  +stride]
3033                              - s2[x+1] + s2[x+1+stride]);
3034             }
3035         }
3036         s1+= stride;
3037         s2+= stride;
3038     }
3039
3040     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3041     else  return score1 + FFABS(score2)*8;
3042 }
3043
3044 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3045     int i;
3046     unsigned int sum=0;
3047
3048     for(i=0; i<8*8; i++){
3049         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3050         int w= weight[i];
3051         b>>= RECON_SHIFT;
3052         assert(-512<b && b<512);
3053
3054         sum += (w*b)*(w*b)>>4;
3055     }
3056     return sum>>2;
3057 }
3058
3059 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3060     int i;
3061
3062     for(i=0; i<8*8; i++){
3063         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3064     }
3065 }
3066
3067 /**
3068  * permutes an 8x8 block.
3069  * @param block the block which will be permuted according to the given permutation vector
3070  * @param permutation the permutation vector
3071  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3072  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3073  *                  (inverse) permutated to scantable order!
3074  */
3075 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3076 {
3077     int i;
3078     DCTELEM temp[64];
3079
3080     if(last<=0) return;
3081     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3082
3083     for(i=0; i<=last; i++){
3084         const int j= scantable[i];
3085         temp[j]= block[j];
3086         block[j]=0;
3087     }
3088
3089     for(i=0; i<=last; i++){
3090         const int j= scantable[i];
3091         const int perm_j= permutation[j];
3092         block[perm_j]= temp[j];
3093     }
3094 }
3095
/**
 * Comparison function that ignores its arguments and always reports 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3099
3100 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3101     int i;
3102
3103     memset(cmp, 0, sizeof(void*)*6);
3104
3105     for(i=0; i<6; i++){
3106         switch(type&0xFF){
3107         case FF_CMP_SAD:
3108             cmp[i]= c->sad[i];
3109             break;
3110         case FF_CMP_SATD:
3111             cmp[i]= c->hadamard8_diff[i];
3112             break;
3113         case FF_CMP_SSE:
3114             cmp[i]= c->sse[i];
3115             break;
3116         case FF_CMP_DCT:
3117             cmp[i]= c->dct_sad[i];
3118             break;
3119         case FF_CMP_DCT264:
3120             cmp[i]= c->dct264_sad[i];
3121             break;
3122         case FF_CMP_DCTMAX:
3123             cmp[i]= c->dct_max[i];
3124             break;
3125         case FF_CMP_PSNR:
3126             cmp[i]= c->quant_psnr[i];
3127             break;
3128         case FF_CMP_BIT:
3129             cmp[i]= c->bit[i];
3130             break;
3131         case FF_CMP_RD:
3132             cmp[i]= c->rd[i];
3133             break;
3134         case FF_CMP_VSAD:
3135             cmp[i]= c->vsad[i];
3136             break;
3137         case FF_CMP_VSSE:
3138             cmp[i]= c->vsse[i];
3139             break;
3140         case FF_CMP_ZERO:
3141             cmp[i]= zero_cmp;
3142             break;
3143         case FF_CMP_NSSE:
3144             cmp[i]= c->nsse[i];
3145             break;
3146 #if CONFIG_DWT
3147         case FF_CMP_W53:
3148             cmp[i]= c->w53[i];
3149             break;
3150         case FF_CMP_W97:
3151             cmp[i]= c->w97[i];
3152             break;
3153 #endif
3154         default:
3155             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3156         }
3157     }
3158 }
3159
3160 static void clear_block_c(DCTELEM *block)
3161 {
3162     memset(block, 0, sizeof(DCTELEM)*64);
3163 }
3164
3165 /**
3166  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3167  */
3168 static void clear_blocks_c(DCTELEM *blocks)
3169 {
3170     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3171 }
3172
3173 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3174     long i;
3175     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3176         long a = *(long*)(src+i);
3177         long b = *(long*)(dst+i);
3178         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3179     }
3180     for(; i<w; i++)
3181         dst[i+0] += src[i+0];
3182 }
3183
3184 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3185     long i;
3186     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3187         long a = *(long*)(src1+i);
3188         long b = *(long*)(src2+i);
3189         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3190     }
3191     for(; i<w; i++)
3192         dst[i] = src1[i]+src2[i];
3193 }
3194
3195 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3196     long i;
3197 #if !HAVE_FAST_UNALIGNED
3198     if((long)src2 & (sizeof(long)-1)){
3199         for(i=0; i+7<w; i+=8){
3200             dst[i+0] = src1[i+0]-src2[i+0];
3201             dst[i+1] = src1[i+1]-src2[i+1];
3202             dst[i+2] = src1[i+2]-src2[i+2];
3203             dst[i+3] = src1[i+3]-src2[i+3];
3204             dst[i+4] = src1[i+4]-src2[i+4];
3205             dst[i+5] = src1[i+5]-src2[i+5];
3206             dst[i+6] = src1[i+6]-src2[i+6];
3207             dst[i+7] = src1[i+7]-src2[i+7];
3208         }
3209     }else
3210 #endif
3211     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3212         long a = *(long*)(src1+i);
3213         long b = *(long*)(src2+i);
3214         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3215     }
3216     for(; i<w; i++)
3217         dst[i+0] = src1[i+0]-src2[i+0];
3218 }
3219
/**
 * Reconstructs one line from residuals using a three-way prediction.
 *
 * For every position the predictor is mid_pred() of the previous output
 * (left), the sample above (src1[i]) and left + above - top_left reduced to
 * 8 bits; the residual diff[i] is added to it modulo 256.
 *
 * @param dst      reconstructed line
 * @param src1     line above
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: sample left of the first one; out: last output sample
 * @param left_top in: sample above-left of the first one; out: last sample of src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top)
{
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t above = src1[n];

        cur    = mid_pred(cur, above, (cur + above - diag) & 0xFF) + diff[n];
        diag   = above;
        dst[n] = cur;
    }

    *left     = cur;
    *left_top = diag;
}
3236
/**
 * Computes the residuals of one line against a three-way prediction.
 *
 * For every position the predictor is mid_pred() of the previous sample of
 * src2 (left), the sample above (src1[i]) and left + above - top_left
 * reduced to 8 bits; dst[i] receives src2[i] minus that predictor, modulo 256.
 *
 * @param dst      residuals
 * @param src1     line above
 * @param src2     current line
 * @param w        number of samples
 * @param left     in: sample left of the first one; out: last sample of src2
 * @param left_top in: sample above-left of the first one; out: last sample of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t above = src1[n];
        const int pred      = mid_pred(cur, above, (cur + above - diag) & 0xFF);

        diag   = above;
        cur    = src2[n];
        dst[n] = cur - pred;
    }

    *left     = cur;
    *left_top = diag;
}
3254
/**
 * Writes the running sum of src to dst, starting from acc.
 *
 * Every output byte is the low 8 bits of the accumulator after the
 * corresponding input byte has been added.
 *
 * @param dst destination
 * @param src input bytes
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the accumulator after the last byte (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc)
{
    int n;

    for (n = 0; n < w; n++) {
        acc   += src[n];
        dst[n] = acc;
    }

    return acc;
}
3273
/* byte offsets of the four components inside one 32-bit pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Writes the per-component running sums of a line of 4-byte pixels.
 *
 * Each of the four components keeps its own accumulator; every output byte
 * is the low 8 bits of its accumulator after the input byte has been added.
 *
 * @param dst   destination, 4 bytes per pixel
 * @param src   input, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: accumulator after the last pixel
 * @param green same as red, for the green component
 * @param blue  same as red, for the blue component
 * @param alpha same as red, for the alpha component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3314
/*
 * Butterfly with separate inputs and outputs: o1 receives the sum and o2 the
 * difference of i1 and i2. Expands to two statements, so it must not be used
 * as the unbraced body of an if/for/while.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/*
 * In-place butterfly: x becomes x+y and y becomes x-y, both computed from the
 * original values. The block declares locals named a and b, so the arguments
 * must not be expressions that refer to variables of those names.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded into the sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3329
/**
 * Sum of the absolute values of the 8x8 butterfly transform of the
 * difference src - dst.
 *
 * The transform is applied to the rows first and to the columns afterwards;
 * the last column stage is merged with the summation.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance between the start of two consecutive lines
 * @param h      must be 8
 * @return the sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int total = 0;
    int n;

    assert(h==8);

    /* rows: difference of the two blocks, then three butterfly stages */
    for (n = 0; n < 8; n++) {
        int *row          = temp + 8*n;
        const uint8_t *ps = src + stride*n;
        const uint8_t *pd = dst + stride*n;

        BUTTERFLY2(row[0], row[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(row[2], row[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(row[4], row[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(row[6], row[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* columns: two butterfly stages, the third one is part of the sum */
    for (n = 0; n < 8; n++) {
        int *col = temp + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
3381
/**
 * Sum of the absolute values of the 8x8 butterfly transform of a block,
 * with the contribution of the mean removed.
 *
 * The transform is applied to the rows first and to the columns afterwards;
 * the last column stage is merged with the summation. The term that the
 * summation added for the first coefficient is subtracted again at the end.
 *
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance between the start of two consecutive lines
 * @param h      must be 8
 * @return the sum of absolute transformed values without the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int total = 0;
    int n;

    assert(h==8);

    /* rows: three butterfly stages on the pixels themselves */
    for (n = 0; n < 8; n++) {
        int *row          = temp + 8*n;
        const uint8_t *ps = src + stride*n;

        BUTTERFLY2(row[0], row[1], ps[0], ps[1]);
        BUTTERFLY2(row[2], row[3], ps[2], ps[3]);
        BUTTERFLY2(row[4], row[5], ps[4], ps[5]);
        BUTTERFLY2(row[6], row[7], ps[6], ps[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* columns: two butterfly stages, the third one is part of the sum */
    for (n = 0; n < 8; n++) {
        int *col = temp + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(temp[8*0] + temp[8*4]); /* remove the mean */

    return total;
}
3429
3430 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3431     MpegEncContext * const s= (MpegEncContext *)c;
3432     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3433
3434     assert(h==8);
3435
3436     s->dsp.diff_pixels(temp, src1, src2, stride);
3437     s->dsp.fdct(temp);
3438     return s->dsp.sum_abs_dctelem(temp);
3439 }
3440
3441 #if CONFIG_GPL
/*
 * One pass of an 8-point integer transform that uses only additions,
 * subtractions and shifts.
 *
 * The expansion reads its eight inputs through SRC(0)..SRC(7) and hands each
 * of its eight results to DST(index, value); both helper macros must be
 * defined where DCT8_1D is expanded. All inputs are read into locals before
 * the first DST(), so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3468
3469 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3470     MpegEncContext * const s= (MpegEncContext *)c;
3471     DCTELEM dct[8][8];
3472     int i;
3473     int sum=0;
3474
3475     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3476
3477 #define SRC(x) dct[i][x]
3478 #define DST(x,v) dct[i][x]= v
3479     for( i = 0; i < 8; i++ )
3480         DCT8_1D
3481 #undef SRC
3482 #undef DST
3483
3484 #define SRC(x) dct[x][i]
3485 #define DST(x,v) sum += FFABS(v)
3486     for( i = 0; i < 8; i++ )
3487         DCT8_1D
3488 #undef SRC
3489 #undef DST
3490     return sum;
3491 }
3492 #endif
3493
3494 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3495     MpegEncContext * const s= (MpegEncContext *)c;
3496     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3497     int sum=0, i;
3498
3499     assert(h==8);
3500
3501     s->dsp.diff_pixels(temp, src1, src2, stride);
3502     s->dsp.fdct(temp);
3503
3504     for(i=0; i<64; i++)
3505         sum= FFMAX(sum, FFABS(temp[i]));
3506
3507     return sum;
3508 }
3509
3510 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3511     MpegEncContext * const s= (MpegEncContext *)c;
3512     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3513     DCTELEM * const bak = temp+64;
3514     int sum=0, i;
3515
3516     assert(h==8);
3517     s->mb_intra=0;
3518
3519     s->dsp.diff_pixels(temp, src1, src2, stride);
3520
3521     memcpy(bak, temp, 64*sizeof(DCTELEM));
3522
3523     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3524     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3525     ff_simple_idct(temp); //FIXME
3526
3527     for(i=0; i<64; i++)
3528         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3529
3530     return sum;
3531 }
3532
3533 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3534     MpegEncContext * const s= (MpegEncContext *)c;
3535     const uint8_t *scantable= s->intra_scantable.permutated;
3536     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3537     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3538     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3539     int i, last, run, bits, level, distortion, start_i;
3540     const int esc_length= s->ac_esc_length;
3541     uint8_t * length;
3542     uint8_t * last_length;
3543
3544     assert(h==8);
3545
3546     copy_block8(lsrc1, src1, 8, stride, 8);
3547     copy_block8(lsrc2, src2, 8, stride, 8);
3548
3549     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3550
3551     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3552
3553     bits=0;
3554
3555     if (s->mb_intra) {
3556         start_i = 1;
3557         length     = s->intra_ac_vlc_length;
3558         last_length= s->intra_ac_vlc_last_length;
3559         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3560     } else {
3561         start_i = 0;
3562         length     = s->inter_ac_vlc_length;
3563         last_length= s->inter_ac_vlc_last_length;
3564     }
3565
3566     if(last>=start_i){
3567         run=0;
3568         for(i=start_i; i<last; i++){
3569             int j= scantable[i];
3570             level= temp[j];
3571
3572             if(level){
3573                 level+=64;
3574                 if((level&(~127)) == 0){
3575                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3576                 }else
3577                     bits+= esc_length;
3578                 run=0;
3579             }else
3580                 run++;
3581         }
3582         i= scantable[last];
3583
3584         level= temp[i] + 64;
3585
3586         assert(level - 64);
3587
3588         if((level&(~127)) == 0){
3589             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3590         }else
3591             bits+= esc_length;
3592
3593     }
3594
3595     if(last>=0){
3596         if(s->mb_intra)
3597             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3598         else
3599             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3600     }
3601
3602     s->dsp.idct_add(lsrc2, 8, temp);
3603
3604     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3605
3606     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3607 }
3608
3609 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3610     MpegEncContext * const s= (MpegEncContext *)c;
3611     const uint8_t *scantable= s->intra_scantable.permutated;
3612     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3613     int i, last, run, bits, level, start_i;
3614     const int esc_length= s->ac_esc_length;
3615     uint8_t * length;
3616     uint8_t * last_length;
3617
3618     assert(h==8);
3619
3620     s->dsp.diff_pixels(temp, src1, src2, stride);
3621
3622     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3623
3624     bits=0;
3625
3626     if (s->mb_intra) {
3627         start_i = 1;
3628         length     = s->intra_ac_vlc_length;
3629         last_length= s->intra_ac_vlc_last_length;
3630         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3631     } else {
3632         start_i = 0;
3633         length     = s->inter_ac_vlc_length;
3634         last_length= s->inter_ac_vlc_last_length;
3635     }
3636
3637     if(last>=start_i){
3638         run=0;
3639         for(i=start_i; i<last; i++){
3640             int j= scantable[i];
3641             level= temp[j];
3642
3643             if(level){
3644                 level+=64;
3645                 if((level&(~127)) == 0){
3646                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3647                 }else
3648                     bits+= esc_length;
3649                 run=0;
3650             }else
3651                 run++;
3652         }
3653         i= scantable[last];
3654
3655         level= temp[i] + 64;
3656
3657         assert(level - 64);
3658
3659         if((level&(~127)) == 0){
3660             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3661         }else
3662             bits+= esc_length;
3663     }
3664
3665     return bits;
3666 }
3667
/*
 * Defines vsad_intra<size>_c(): the sum of absolute differences between each
 * line of a block and the line below it, over the first <size> columns and
 * h-1 line pairs. The context and the second pointer are unused.
 */
#define VSAD_INTRA(size)                                                                                      \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)    \
{                                                                                                             \
    int total = 0;                                                                                            \
    int row, col;                                                                                             \
                                                                                                              \
    for (row = 1; row < h; row++) {                                                                           \
        for (col = 0; col < size; col++)                                                                      \
            total += FFABS(s[col] - s[col + stride]);                                                         \
        s += stride;                                                                                          \
    }                                                                                                         \
                                                                                                              \
    return total;                                                                                             \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3685
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block.
 *
 * For every pair of consecutive lines the difference of the upper line is
 * compared with the difference of the lower line.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between the start of two consecutive lines
 * @param h      number of lines; h-1 line pairs are evaluated
 * @return the accumulated absolute gradient
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);

        s1 += stride;
        s2 += stride;
    }

    return total;
}
3700
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))
/*
 * Defines vsse_intra<size>_c(): the sum of squared differences between each
 * line of a block and the line below it, over the first <size> columns and
 * h-1 line pairs. The context and the second pointer are unused.
 */
#define VSSE_INTRA(size)                                                                                      \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)    \
{                                                                                                             \
    int total = 0;                                                                                            \
    int row, col;                                                                                             \
                                                                                                              \
    for (row = 1; row < h; row++) {                                                                           \
        for (col = 0; col < size; col++)                                                                      \
            total += SQ(s[col] - s[col + stride]);                                                            \
        s += stride;                                                                                          \
    }                                                                                                         \
                                                                                                              \
    return total;                                                                                             \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3719
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block.
 *
 * For every pair of consecutive lines the difference of the upper line is
 * compared with the difference of the lower line.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between the start of two consecutive lines
 * @param h      number of lines; h-1 line pairs are evaluated
 * @return the accumulated squared gradient
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);

        s1 += stride;
        s2 += stride;
    }

    return total;
}
3734
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements
 * @return the sum of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }

    return total;
}
3743
/*
 * Each line pairs one of the 8x8 comparison functions above with the name of
 * a 16-pixel variant.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this part of the file;
 * presumably it defines the second function by applying the first one to the
 * 8x8 quadrants of a 16-pixel-wide block -- confirm against the macro.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3754
/**
 * Multiplies dst by src element by element, in place: dst[i] *= src[i].
 *
 * @param dst first factor and destination
 * @param src second factor
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len)
{
    int k = 0;

    while (k < len) {
        dst[k] *= src[k];
        k++;
    }
}
3760
/**
 * Multiplies src0 by src1 read backwards:
 * dst[i] = src0[i] * src1[len - 1 - i].
 *
 * @param dst  destination
 * @param src0 first factor, read front to back
 * @param src1 second factor, read back to front
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
3767
/**
 * Multiply-add of three arrays: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  destination
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3773
/**
 * Windowed overlap of two half blocks, producing 2*len outputs.
 *
 * The outputs are generated in mirrored pairs, from the outer ends of dst
 * towards its centre. For k = 0 .. len-1 and m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k] + add_bias
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m] + add_bias
 *
 * @param dst      destination, 2*len elements
 * @param src0     first input, len elements
 * @param src1     second input, len elements
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half of the number of outputs
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int m  = 2*len - 1 - k; /* mirror position of k */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[k];
        const float wj = win[m];

        dst[k] = s0*wj - s1*wi + add_bias;
        dst[m] = s0*wi + s1*wj + add_bias;
    }
}
3788
/**
 * Multiplies an array by a constant: dst[i] = src[i] * mul.
 *
 * @param dst destination
 * @param src input
 * @param mul factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3796
/**
 * Multiplies an array by a sequence of 2-element vectors and a constant.
 *
 * Every pair of consecutive elements of src uses the next pointer of sv:
 * dst[i+j] = src[i+j] * sv[i/2][j] * mul for j = 0, 1.
 * Elements are written in pairs, so an odd len writes one extra element.
 *
 * @param dst destination
 * @param src input
 * @param sv  array of pointers to 2-element vectors, one per pair
 * @param mul factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 2) {
        const float *vec = sv[k >> 1];

        dst[k    ] = src[k    ] * vec[0] * mul;
        dst[k + 1] = src[k + 1] * vec[1] * mul;
    }
}
3806
/**
 * Multiplies an array by a sequence of 4-element vectors and a constant.
 *
 * Every group of four consecutive elements of src uses the next pointer of
 * sv: dst[i+j] = src[i+j] * sv[i/4][j] * mul for j = 0 .. 3.
 * Elements are written in groups of four, so len is expected to be a
 * multiple of 4.
 *
 * @param dst destination
 * @param src input
 * @param sv  array of pointers to 4-element vectors, one per group
 * @param mul factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 4) {
        const float *vec = sv[k >> 2];

        dst[k    ] = src[k    ] * vec[0] * mul;
        dst[k + 1] = src[k + 1] * vec[1] * mul;
        dst[k + 2] = src[k + 2] * vec[2] * mul;
        dst[k + 3] = src[k + 3] * vec[3] * mul;
    }
}
3818
/**
 * Concatenates a sequence of 2-element vectors, scaled by a constant:
 * dst[i+j] = sv[i/2][j] * mul for j = 0, 1.
 * Elements are written in pairs, so an odd len writes one extra element.
 *
 * @param dst destination
 * @param sv  array of pointers to 2-element vectors, one per pair
 * @param mul factor
 * @param len number of elements
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 2) {
        const float *vec = sv[k >> 1];

        dst[k    ] = vec[0] * mul;
        dst[k + 1] = vec[1] * mul;
    }
}
3828
/**
 * Concatenates a sequence of 4-element vectors, scaled by a constant:
 * dst[i+j] = sv[i/4][j] * mul for j = 0 .. 3.
 * Elements are written in groups of four, so len is expected to be a
 * multiple of 4.
 *
 * @param dst destination
 * @param sv  array of pointers to 4-element vectors, one per group
 * @param mul factor
 * @param len number of elements
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 4) {
        const float *vec = sv[k >> 2];

        dst[k    ] = vec[0] * mul;
        dst[k + 1] = vec[1] * mul;
        dst[k + 2] = vec[2] * mul;
        dst[k + 3] = vec[3] * mul;
    }
}
3840
/**
 * In-place butterfly of two arrays: v1[i] becomes v1[i] + v2[i] and v2[i]
 * becomes v1[i] - v2[i], both computed from the original values.
 *
 * @param v1  first array, receives the sums
 * @param v2  second array, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
3851
/**
 * Scalar product of two float arrays, accumulated in single precision in
 * index order.
 *
 * @param v1  first array
 * @param v2  second array
 * @param len number of elements
 * @return the sum of v1[i] * v2[i]
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
3862
/**
 * Converts integers to float and scales them: dst[i] = src[i] * mul.
 *
 * @param dst destination
 * @param src input integers
 * @param mul factor
 * @param len number of elements
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3868
/**
 * Clips one value given as the raw 32-bit pattern of a float, for a range
 * whose lower bound is negative and whose upper bound is positive.
 *
 * This relies on the IEEE-754 single precision layout: patterns with the
 * sign bit set compare, as unsigned integers, above all patterns without it,
 * and grow with the magnitude. So "a > mini" holds exactly for values below
 * the negative minimum, and flipping the sign bit allows the same test
 * against the maximum.
 *
 * @param a        pattern of the value to clip
 * @param mini     pattern of the (negative) minimum
 * @param maxi     pattern of the (positive) maximum
 * @param maxisign maxi with the sign bit flipped
 * @return the pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* 1U: shifting a signed 1 into the sign bit is undefined behaviour */
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clips an array of floats to [*min, *max] using integer comparisons only.
 * Valid only for *min < 0 < *max.
 *
 * Elements are handled in groups of eight, so len is expected to be a
 * multiple of 8.
 *
 * @param dst destination
 * @param src input
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini, maxi, maxisign;
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    /* fetch the bit patterns of the bounds without reading a float object
     * through an integer lvalue */
    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U<<31);

    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clips an array of floats to [min, max].
 *
 * When the bounds have opposite signs the integer based
 * vector_clipf_c_opposite_sign() is used, otherwise every element goes
 * through av_clipf(). Elements are handled in groups of eight, so len is
 * expected to be a multiple of 8.
 *
 * @param dst destination
 * @param src input
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
3913
3914 static av_always_inline int float_to_int16_one(const float *src){
3915     int_fast32_t tmp = *(const int32_t*)src;
3916     if(tmp & 0xf0000){
3917         tmp = (0x43c0ffff - tmp)>>31;
3918         // is this faster on some gcc/cpu combinations?
3919 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3920 //      else                 tmp = 0;
3921     }
3922     return tmp - 0x8000;
3923 }
3924
/**
 * Convert len floats from src to int16_t in dst using float_to_int16_one().
 *
 * @param dst destination, len elements
 * @param src source, len elements
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i; /* same type as len so the comparison cannot overflow the counter */
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
3930
/**
 * Convert planar float channels to interleaved int16_t.
 *
 * Sample i of channel c is written to dst[i*channels + c]; each sample is
 * converted with float_to_int16_one(). Two channels take a dedicated loop.
 *
 * @param dst      interleaved output, len*channels elements
 * @param src      array of channels pointers, each to len floats
 * @param len      samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    /* i and j are long: i is compared against len and j reaches len*channels */
    long i, j;
    int c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
3944
/**
 * Sum of the element-wise products of v1 and v2, each product shifted right
 * by shift before it is accumulated.
 *
 * @param v1    first vector, order elements
 * @param v2    second vector, order elements
 * @param order number of elements
 * @param shift right shift applied to every product
 * @return the accumulated sum
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--) {
        const int prod = *v1 * *v2;

        acc += prod >> shift;
        v1++;
        v2++;
    }

    return acc;
}
3954
/**
 * Compute the dot product of v1 and v2 and, in the same pass, update v1 in
 * place with v1[k] += mul * v3[k].
 *
 * Each product uses the value of v1[k] from before its update.
 *
 * @param v1    vector that is both read and updated, order elements
 * @param v2    second factor of the dot product, order elements
 * @param v3    vector that is scaled by mul and added to v1, order elements
 * @param order number of elements
 * @param mul   scale factor for v3
 * @return dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order; order--, v1++, v2++, v3++) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
    }
    return acc;
}
3964
/* Integer weights used by wmv2_idct_row() and wmv2_idct_col() below.
 * W0 multiplies the b[0] and b[4] terms and has the same value as W4;
 * W1..W7 follow the formula given next to each of them. */
3965 #define W0 2048
3966 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3967 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3968 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3969 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3970 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3971 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3972 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3973
3974 static void wmv2_idct_row(short * b)
3975 {
3976     int s1,s2;
3977     int a0,a1,a2,a3,a4,a5,a6,a7;
3978     /*step 1*/
3979     a1 = W1*b[1]+W7*b[7];
3980     a7 = W7*b[1]-W1*b[7];
3981     a5 = W5*b[5]+W3*b[3];
3982     a3 = W3*b[5]-W5*b[3];
3983     a2 = W2*b[2]+W6*b[6];
3984     a6 = W6*b[2]-W2*b[6];
3985     a0 = W0*b[0]+W0*b[4];
3986     a4 = W0*b[0]-W0*b[4];
3987     /*step 2*/
3988     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3989     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3990     /*step 3*/
3991     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3992     b[1] = (a4+a6 +s1   + (1<<7))>>8;
3993     b[2] = (a4-a6 +s2   + (1<<7))>>8;
3994     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3995     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3996     b[5] = (a4-a6 -s2   + (1<<7))>>8;
3997     b[6] = (a4+a6 -s1   + (1<<7))>>8;
3998     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
3999 }
4000 static void wmv2_idct_col(short * b)
4001 {
4002     int s1,s2;
4003     int a0,a1,a2,a3,a4,a5,a6,a7;
4004     /*step 1, with extended precision*/
4005     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4006     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4007     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4008     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4009     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4010     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4011     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4012     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4013     /*step 2*/
4014     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4015     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4016     /*step 3*/
4017     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4018     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4019     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4020     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4021
4022     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4023     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4024     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4025     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4026 }
/**
 * In-place WMV2 inverse DCT of a 64-element block: wmv2_idct_row() on each
 * of the 8 rows, then wmv2_idct_col() on each of the 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    short *row;
    short *col;

    for(row = block; row < block + 64; row += 8)
        wmv2_idct_row(row);

    for(col = block; col < block + 8; col++)
        wmv2_idct_col(col);
}
4037 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4038  converted */
/* Run ff_wmv2_idct_c() on block in place, then pass block, dest and
 * line_size to put_pixels_clamped_c(). */
4039 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4040 {
4041     ff_wmv2_idct_c(block);
4042     put_pixels_clamped_c(block, dest, line_size);
4043 }
/* Run ff_wmv2_idct_c() on block in place, then pass block, dest and
 * line_size to add_pixels_clamped_c(). */
4044 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4045 {
4046     ff_wmv2_idct_c(block);
4047     add_pixels_clamped_c(block, dest, line_size);
4048 }
/* Run j_rev_dct() on block, then pass block, dest and line_size to
 * put_pixels_clamped_c(). */
4049 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4050 {
4051     j_rev_dct (block);
4052     put_pixels_clamped_c(block, dest, line_size);
4053 }
/* Run j_rev_dct() on block, then pass block, dest and line_size to
 * add_pixels_clamped_c(). */
4054 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4055 {
4056     j_rev_dct (block);
4057     add_pixels_clamped_c(block, dest, line_size);
4058 }
4059
/* Run j_rev_dct4() on block, then pass block, dest and line_size to
 * put_pixels_clamped4_c(). Selected by dsputil_init() for lowres==1. */
4060 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4061 {
4062     j_rev_dct4 (block);
4063     put_pixels_clamped4_c(block, dest, line_size);
4064 }
/* Run j_rev_dct4() on block, then pass block, dest and line_size to
 * add_pixels_clamped4_c(). Selected by dsputil_init() for lowres==1. */
4065 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4066 {
4067     j_rev_dct4 (block);
4068     add_pixels_clamped4_c(block, dest, line_size);
4069 }
4070
/* Run j_rev_dct2() on block, then pass block, dest and line_size to
 * put_pixels_clamped2_c(). Selected by dsputil_init() for lowres==2. */
4071 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4072 {
4073     j_rev_dct2 (block);
4074     put_pixels_clamped2_c(block, dest, line_size);
4075 }
/* Run j_rev_dct2() on block, then pass block, dest and line_size to
 * add_pixels_clamped2_c(). Selected by dsputil_init() for lowres==2. */
4076 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4077 {
4078     j_rev_dct2 (block);
4079     add_pixels_clamped2_c(block, dest, line_size);
4080 }
4081
4082 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4083 {
4084     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4085
4086     dest[0] = cm[(block[0] + 4)>>3];
4087 }
4088 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4089 {
4090     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4091
4092     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4093 }
4094
4095 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4096
4097 /* init static data */
4098 av_cold void dsputil_static_init(void)
4099 {
4100     int i;
4101
4102     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4103     for(i=0;i<MAX_NEG_CROP;i++) {
4104         ff_cropTbl[i] = 0;
4105         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4106     }
4107
4108     for(i=0;i<512;i++) {
4109         ff_squareTbl[i] = (i - 256) * (i - 256);
4110     }
4111
4112     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4113 }
4114
4115 int ff_check_alignment(void){
4116     static int did_fail=0;
4117     DECLARE_ALIGNED(16, int, aligned);
4118
4119     if((intptr_t)&aligned & 15){
4120         if(!did_fail){
4121 #if HAVE_MMX || HAVE_ALTIVEC
4122             av_log(NULL, AV_LOG_ERROR,
4123                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4124                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4125                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4126                 "Do not report crashes to FFmpeg developers.\n");
4127 #endif
4128             did_fail=1;
4129         }
4130         return -1;
4131     }
4132     return 0;
4133 }
4134
/**
 * Fill a DSPContext with the C implementations and then let the enabled
 * architecture-specific init functions run on it.
 *
 * Steps, in the order they appear below:
 *  - call ff_check_alignment() (its return value is ignored);
 *  - with CONFIG_ENCODERS, pick fdct/fdct248 from avctx->dct_algo;
 *  - pick idct, idct_put, idct_add and idct_permutation_type from
 *    avctx->lowres and avctx->idct_algo;
 *  - assign the C function pointers and call the per-codec init helpers;
 *  - call the dsputil_init_<arch>() functions guarded by HAVE_x / ARCH_x /
 *    CONFIG_MLIB;
 *  - fill still-empty 2tap qpel entries and the [.][0] entries of the
 *    rv30/rv40 tables from the h264 qpel tables;
 *  - build idct_permutation[] from idct_permutation_type.
 *
 * @param c     context to fill
 * @param avctx codec context; dct_algo, idct_algo and lowres are read here,
 *              and it is handed on to the init helpers and to av_log()
 */
4135 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4136 {
4137     int i;
4138
4139     ff_check_alignment();
4140
4141 #if CONFIG_ENCODERS
4142     if(avctx->dct_algo==FF_DCT_FASTINT) {
4143         c->fdct = fdct_ifast;
4144         c->fdct248 = fdct_ifast248;
4145     }
4146     else if(avctx->dct_algo==FF_DCT_FAAN) {
4147         c->fdct = ff_faandct;
4148         c->fdct248 = ff_faandct248;
4149     }
4150     else {
4151         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4152         c->fdct248 = ff_fdct248_islow;
4153     }
4154 #endif //CONFIG_ENCODERS
4155
    /* lowres 1, 2 and 3 use the j_rev_dct4/2/1 based functions and never
     * permute coefficients; for lowres 1 the h264 lowres put/add functions
     * are used instead when the H.264 decoder is compiled in and idct_algo
     * is neither FF_IDCT_INT nor FF_IDCT_AUTO. Any other lowres value takes
     * the idct_algo chain in the final else. */
4156     if(avctx->lowres==1){
4157         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4158             c->idct_put= ff_jref_idct4_put;
4159             c->idct_add= ff_jref_idct4_add;
4160         }else{
4161             c->idct_put= ff_h264_lowres_idct_put_c;
4162             c->idct_add= ff_h264_lowres_idct_add_c;
4163         }
4164         c->idct    = j_rev_dct4;
4165         c->idct_permutation_type= FF_NO_IDCT_PERM;
4166     }else if(avctx->lowres==2){
4167         c->idct_put= ff_jref_idct2_put;
4168         c->idct_add= ff_jref_idct2_add;
4169         c->idct    = j_rev_dct2;
4170         c->idct_permutation_type= FF_NO_IDCT_PERM;
4171     }else if(avctx->lowres==3){
4172         c->idct_put= ff_jref_idct1_put;
4173         c->idct_add= ff_jref_idct1_add;
4174         c->idct    = j_rev_dct1;
4175         c->idct_permutation_type= FF_NO_IDCT_PERM;
4176     }else{
        /* FF_IDCT_INT is the only choice in this chain that uses a
         * permutation other than FF_NO_IDCT_PERM */
4177         if(avctx->idct_algo==FF_IDCT_INT){
4178             c->idct_put= ff_jref_idct_put;
4179             c->idct_add= ff_jref_idct_add;
4180             c->idct    = j_rev_dct;
4181             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4182         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4183                 avctx->idct_algo==FF_IDCT_VP3){
4184             c->idct_put= ff_vp3_idct_put_c;
4185             c->idct_add= ff_vp3_idct_add_c;
4186             c->idct    = ff_vp3_idct_c;
4187             c->idct_permutation_type= FF_NO_IDCT_PERM;
4188         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4189             c->idct_put= ff_wmv2_idct_put_c;
4190             c->idct_add= ff_wmv2_idct_add_c;
4191             c->idct    = ff_wmv2_idct_c;
4192             c->idct_permutation_type= FF_NO_IDCT_PERM;
4193         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4194             c->idct_put= ff_faanidct_put;
4195             c->idct_add= ff_faanidct_add;
4196             c->idct    = ff_faanidct;
4197             c->idct_permutation_type= FF_NO_IDCT_PERM;
4198         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* only idct_put is assigned in this branch; idct and idct_add
             * keep whatever value they had on entry */
4199             c->idct_put= ff_ea_idct_put_c;
4200             c->idct_permutation_type= FF_NO_IDCT_PERM;
4201         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4202             c->idct     = ff_bink_idct_c;
4203             c->idct_add = ff_bink_idct_add_c;
4204             c->idct_put = ff_bink_idct_put_c;
4205             c->idct_permutation_type = FF_NO_IDCT_PERM;
4206         }else{ //accurate/default
4207             c->idct_put= ff_simple_idct_put;
4208             c->idct_add= ff_simple_idct_add;
4209             c->idct    = ff_simple_idct;
4210             c->idct_permutation_type= FF_NO_IDCT_PERM;
4211         }
4212     }
4213
    /* C defaults; the architecture init functions called further down run
     * after these assignments and can replace any of them */
4214     c->get_pixels = get_pixels_c;
4215     c->diff_pixels = diff_pixels_c;
4216     c->put_pixels_clamped = put_pixels_clamped_c;
4217     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4218     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4219     c->add_pixels_clamped = add_pixels_clamped_c;
4220     c->add_pixels8 = add_pixels8_c;
4221     c->add_pixels4 = add_pixels4_c;
4222     c->sum_abs_dctelem = sum_abs_dctelem_c;
4223     c->gmc1 = gmc1_c;
4224     c->gmc = ff_gmc_c;
4225     c->clear_block = clear_block_c;
4226     c->clear_blocks = clear_blocks_c;
4227     c->pix_sum = pix_sum_c;
4228     c->pix_norm1 = pix_norm1_c;
4229
4230     c->fill_block_tab[0] = fill_block16_c;
4231     c->fill_block_tab[1] = fill_block8_c;
4232     c->scale_block = scale_block_c;
4233
4234     /* TODO [0] 16  [1] 8 */
4235     c->pix_abs[0][0] = pix_abs16_c;
4236     c->pix_abs[0][1] = pix_abs16_x2_c;
4237     c->pix_abs[0][2] = pix_abs16_y2_c;
4238     c->pix_abs[0][3] = pix_abs16_xy2_c;
4239     c->pix_abs[1][0] = pix_abs8_c;
4240     c->pix_abs[1][1] = pix_abs8_x2_c;
4241     c->pix_abs[1][2] = pix_abs8_y2_c;
4242     c->pix_abs[1][3] = pix_abs8_xy2_c;
4243
    /* dspfunc(PFX, IDX, NUM) assigns entries 0..3 of row IDX of
     * c->PFX_pixels_tab to the PFX_pixelsNUM _c, _x2_c, _y2_c and _xy2_c
     * functions */
4244 #define dspfunc(PFX, IDX, NUM) \
4245     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4246     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4247     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4248     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4249
4250     dspfunc(put, 0, 16);
4251     dspfunc(put_no_rnd, 0, 16);
4252     dspfunc(put, 1, 8);
4253     dspfunc(put_no_rnd, 1, 8);
4254     dspfunc(put, 2, 4);
4255     dspfunc(put, 3, 2);
4256
4257     dspfunc(avg, 0, 16);
4258     dspfunc(avg_no_rnd, 0, 16);
4259     dspfunc(avg, 1, 8);
4260     dspfunc(avg_no_rnd, 1, 8);
4261     dspfunc(avg, 2, 4);
4262     dspfunc(avg, 3, 2);
4263 #undef dspfunc
4264
4265     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4266     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4267
    /* tpel tables: entry index is X + 4*Y for function mcXY; entries 3, 7
     * and 11 and above are not assigned here */
4268     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4269     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4270     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4271     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4272     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4273     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4274     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4275     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4276     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4277
4278     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4279     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4280     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4281     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4282     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4283     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4284     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4285     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4286     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4287
    /* second dspfunc: fills all 16 entries of row IDX of c->PFX_pixels_tab
     * with PFXNUM_mcXY_c, entry index X + 4*Y */
4288 #define dspfunc(PFX, IDX, NUM) \
4289     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4290     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4291     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4292     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4293     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4294     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4295     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4296     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4297     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4298     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4299     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4300     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4301     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4302     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4303     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4304     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4305
4306     dspfunc(put_qpel, 0, 16);
4307     dspfunc(put_no_rnd_qpel, 0, 16);
4308
4309     dspfunc(avg_qpel, 0, 16);
4310     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4311
4312     dspfunc(put_qpel, 1, 8);
4313     dspfunc(put_no_rnd_qpel, 1, 8);
4314
4315     dspfunc(avg_qpel, 1, 8);
4316     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4317
    /* note: row 3 is filled for put_h264_qpel only; avg_h264_qpel row 3 is
     * not assigned here */
4318     dspfunc(put_h264_qpel, 0, 16);
4319     dspfunc(put_h264_qpel, 1, 8);
4320     dspfunc(put_h264_qpel, 2, 4);
4321     dspfunc(put_h264_qpel, 3, 2);
4322     dspfunc(avg_h264_qpel, 0, 16);
4323     dspfunc(avg_h264_qpel, 1, 8);
4324     dspfunc(avg_h264_qpel, 2, 4);
4325
4326 #undef dspfunc
4327     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4328     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4329     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4330     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4331     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4332     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4333     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4334     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4335
4336     c->draw_edges = draw_edges_c;
4337
    /* per-codec init helpers, each compiled in only when its decoder is
     * enabled */
4338 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4339     ff_mlp_init(c, avctx);
4340 #endif
4341 #if CONFIG_VC1_DECODER
4342     ff_vc1dsp_init(c,avctx);
4343 #endif
4344 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4345     ff_intrax8dsp_init(c,avctx);
4346 #endif
4347 #if CONFIG_RV30_DECODER
4348     ff_rv30dsp_init(c,avctx);
4349 #endif
4350 #if CONFIG_RV40_DECODER
4351     ff_rv40dsp_init(c,avctx);
    /* the mc33 entries (index 15) are set here, after ff_rv40dsp_init() */
4352     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4353     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4354     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4355     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4356 #endif
4357
4358     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4359     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4360     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4361     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4362     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4363     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4364     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4365     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4366
    /* SET_CMP_FUNC(name) sets c->name[0] to name16_c and c->name[1] to
     * name8x8_c; the macro ends in ';', so its uses below carry none.
     * It is not #undef'd in this function. */
4367 #define SET_CMP_FUNC(name) \
4368     c->name[0]= name ## 16_c;\
4369     c->name[1]= name ## 8x8_c;
4370
4371     SET_CMP_FUNC(hadamard8_diff)
4372     c->hadamard8_diff[4]= hadamard8_intra16_c;
4373     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4374     SET_CMP_FUNC(dct_sad)
4375     SET_CMP_FUNC(dct_max)
4376 #if CONFIG_GPL
4377     SET_CMP_FUNC(dct264_sad)
4378 #endif
4379     c->sad[0]= pix_abs16_c;
4380     c->sad[1]= pix_abs8_c;
4381     c->sse[0]= sse16_c;
4382     c->sse[1]= sse8_c;
4383     c->sse[2]= sse4_c;
4384     SET_CMP_FUNC(quant_psnr)
4385     SET_CMP_FUNC(rd)
4386     SET_CMP_FUNC(bit)
    /* vsad/vsse: only entries 0, 4 and 5 are assigned here */
4387     c->vsad[0]= vsad16_c;
4388     c->vsad[4]= vsad_intra16_c;
4389     c->vsad[5]= vsad_intra8_c;
4390     c->vsse[0]= vsse16_c;
4391     c->vsse[4]= vsse_intra16_c;
4392     c->vsse[5]= vsse_intra8_c;
4393     c->nsse[0]= nsse16_c;
4394     c->nsse[1]= nsse8_c;
4395 #if CONFIG_DWT
4396     ff_dsputil_init_dwt(c);
4397 #endif
4398
4399     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4400
4401     c->add_bytes= add_bytes_c;
4402     c->add_bytes_l2= add_bytes_l2_c;
4403     c->diff_bytes= diff_bytes_c;
4404     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4405     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4406     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4407     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4408     c->bswap_buf= bswap_buf;
4409 #if CONFIG_PNG_DECODER
4410     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4411 #endif
4412
4413     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4414         c->h263_h_loop_filter= h263_h_loop_filter_c;
4415         c->h263_v_loop_filter= h263_v_loop_filter_c;
4416     }
4417
4418     if (CONFIG_VP3_DECODER) {
4419         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4420         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4421         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4422     }
4423
4424     c->h261_loop_filter= h261_loop_filter_c;
4425
4426     c->try_8x8basis= try_8x8basis_c;
4427     c->add_8x8basis= add_8x8basis_c;
4428
4429 #if CONFIG_VORBIS_DECODER
4430     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4431 #endif
4432 #if CONFIG_AC3_DECODER
4433     c->ac3_downmix = ff_ac3_downmix_c;
4434 #endif
4435 #if CONFIG_LPC
4436     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4437 #endif
4438     c->vector_fmul = vector_fmul_c;
4439     c->vector_fmul_reverse = vector_fmul_reverse_c;
4440     c->vector_fmul_add = vector_fmul_add_c;
4441     c->vector_fmul_window = ff_vector_fmul_window_c;
4442     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4443     c->vector_clipf = vector_clipf_c;
4444     c->float_to_int16 = ff_float_to_int16_c;
4445     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4446     c->scalarproduct_int16 = scalarproduct_int16_c;
4447     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4448     c->scalarproduct_float = scalarproduct_float_c;
4449     c->butterflies_float = butterflies_float_c;
4450     c->vector_fmul_scalar = vector_fmul_scalar_c;
4451
4452     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4453     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4454
4455     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4456     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4457
4458     c->shrink[0]= av_image_copy_plane;
4459     c->shrink[1]= ff_shrink22;
4460     c->shrink[2]= ff_shrink44;
4461     c->shrink[3]= ff_shrink88;
4462
4463     c->prefetch= just_return;
4464
    /* the 2tap tables are zeroed so that the loop after the arch init calls
     * can tell which entries an arch init filled in */
4465     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4466     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4467
    /* architecture-specific init; each call sees the C defaults set above
     * and whatever earlier calls in this list changed */
4468     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4469     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4470     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4471     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4472     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4473     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4474     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4475     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4476     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4477
    /* every 2tap entry that is still NULL falls back to the h264 qpel
     * function at the same position.
     * NOTE(review): the tables are indexed as [0][i] with i up to 63, i.e.
     * past the end of row 0; this relies on the rows being contiguous and
     * on the tables presumably being [4][16] - their declaration is not
     * visible here, confirm against the DSPContext definition. */
4478     for(i=0; i<64; i++){
4479         if(!c->put_2tap_qpel_pixels_tab[0][i])
4480             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4481         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4482             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4483     }
4484
    /* entry 0 of rows 0 and 1 of the rv30/rv40 tables reuses the h264 qpel
     * entry 0; copied after the arch init calls, so it gets whatever those
     * left in the h264 tables */
4485     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4486     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4487     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4488     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4489
4490     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4491     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4492     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4493     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4494
    /* build the 64-entry coefficient permutation; i is a position in the
     * 8x8 block with the row in bits 3..5 and the column in bits 0..2 */
4495     switch(c->idct_permutation_type){
4496     case FF_NO_IDCT_PERM:
4497         for(i=0; i<64; i++)
4498             c->idct_permutation[i]= i;
4499         break;
4500     case FF_LIBMPEG2_IDCT_PERM:
        /* row kept; column bits 2..1 move down by one, column bit 0 moves to bit 2 */
4501         for(i=0; i<64; i++)
4502             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4503         break;
4504     case FF_SIMPLE_IDCT_PERM:
4505         for(i=0; i<64; i++)
4506             c->idct_permutation[i]= simple_mmx_permutation[i];
4507         break;
4508     case FF_TRANSPOSE_IDCT_PERM:
        /* row and column swapped */
4509         for(i=0; i<64; i++)
4510             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4511         break;
4512     case FF_PARTTRANS_IDCT_PERM:
        /* bits 2 and 5 kept; the low two column bits and the low two row bits swapped */
4513         for(i=0; i<64; i++)
4514             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4515         break;
4516     case FF_SSE2_IDCT_PERM:
        /* row kept; column looked up in idct_sse2_row_perm */
4517         for(i=0; i<64; i++)
4518             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4519         break;
4520     default:
        /* unknown type: only an error is logged, idct_permutation[] is not written */
4521         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4522     }
4523 }
4524