git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "mpegvideo.h"
  37 #include "config.h"
  38 #include "lpc.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42 #include "vp8dsp.h"
  43
/* Lookup table that the clamped pixel writers in this file index through
 * the base pointer (ff_cropTbl + MAX_NEG_CROP). It is only zero-initialised
 * here; presumably it is filled in elsewhere -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table that pix_norm1_c and the sse*_c helpers read through the base
 * pointer (ff_squareTbl + 256), indexed by a pixel value or a pixel
 * difference. Zero-initialised here; presumably filled in elsewhere. */
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 yields 0x01 in every byte of an unsigned long; multiplying
//  replicates the constant into each byte)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  50
  51 const uint8_t ff_zigzag_direct[64] = {
  52     0,   1,  8, 16,  9,  2,  3, 10,
  53     17, 24, 32, 25, 18, 11,  4,  5,
  54     12, 19, 26, 33, 40, 48, 41, 34,
  55     27, 20, 13,  6,  7, 14, 21, 28,
  56     35, 42, 49, 56, 57, 50, 43, 36,
  57     29, 22, 15, 23, 30, 37, 44, 51,
  58     58, 59, 52, 45, 38, 31, 39, 46,
  59     53, 60, 61, 54, 47, 55, 62, 63
  60 };
  61
  62 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  63    specification, we interleave the fields */
  64 const uint8_t ff_zigzag248_direct[64] = {
  65      0,  8,  1,  9, 16, 24,  2, 10,
  66     17, 25, 32, 40, 48, 56, 33, 41,
  67     18, 26,  3, 11,  4, 12, 19, 27,
  68     34, 42, 49, 57, 50, 58, 35, 43,
  69     20, 28,  5, 13,  6, 14, 21, 29,
  70     36, 44, 51, 59, 52, 60, 37, 45,
  71     22, 30,  7, 15, 23, 31, 38, 46,
  72     53, 61, 54, 62, 39, 47, 55, 63,
  73 };
  74
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Declared with 16-byte alignment and 64 uint16_t entries; it is not
 * initialised here, so it is presumably filled in elsewhere -- TODO confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  77
  78 const uint8_t ff_alternate_horizontal_scan[64] = {
  79     0,  1,   2,  3,  8,  9, 16, 17,
  80     10, 11,  4,  5,  6,  7, 15, 14,
  81     13, 12, 19, 18, 24, 25, 32, 33,
  82     26, 27, 20, 21, 22, 23, 28, 29,
  83     30, 31, 34, 35, 40, 41, 48, 49,
  84     42, 43, 36, 37, 38, 39, 44, 45,
  85     46, 47, 50, 51, 56, 57, 58, 59,
  86     52, 53, 54, 55, 60, 61, 62, 63,
  87 };
  88
  89 const uint8_t ff_alternate_vertical_scan[64] = {
  90     0,  8,  16, 24,  1,  9,  2, 10,
  91     17, 25, 32, 40, 48, 56, 57, 49,
  92     41, 33, 26, 18,  3, 11,  4, 12,
  93     19, 27, 34, 42, 50, 58, 35, 43,
  94     51, 59, 20, 28,  5, 13,  6, 14,
  95     21, 29, 36, 44, 52, 60, 37, 45,
  96     53, 61, 22, 30,  7, 15, 23, 31,
  97     38, 46, 54, 62, 39, 47, 55, 63,
  98 };
  99
 100 /* Input permutation for the simple_idct_mmx */
 101 static const uint8_t simple_mmx_permutation[64]={
 102         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 103         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 104         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 105         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 106         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 107         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 108         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 109         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 110 };
 111
 112 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 113
 114 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 115     int i;
 116     int end;
 117
 118     st->scantable= src_scantable;
 119
 120     for(i=0; i<64; i++){
 121         int j;
 122         j = src_scantable[i];
 123         st->permutated[i] = permutation[j];
 124 #if ARCH_PPC
 125         st->inverse[j] = i;
 126 #endif
 127     }
 128
 129     end=-1;
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = st->permutated[i];
 133         if(j>end) end=j;
 134         st->raster_end[i]= end;
 135     }
 136 }
 137
 138 static int pix_sum_c(uint8_t * pix, int line_size)
 139 {
 140     int s, i, j;
 141
 142     s = 0;
 143     for (i = 0; i < 16; i++) {
 144         for (j = 0; j < 16; j += 8) {
 145             s += pix[0];
 146             s += pix[1];
 147             s += pix[2];
 148             s += pix[3];
 149             s += pix[4];
 150             s += pix[5];
 151             s += pix[6];
 152             s += pix[7];
 153             pix += 8;
 154         }
 155         pix += line_size - 16;
 156     }
 157     return s;
 158 }
 159
 160 static int pix_norm1_c(uint8_t * pix, int line_size)
 161 {
 162     int s, i, j;
 163     uint32_t *sq = ff_squareTbl + 256;
 164
 165     s = 0;
 166     for (i = 0; i < 16; i++) {
 167         for (j = 0; j < 16; j += 8) {
 168 #if 0
 169             s += sq[pix[0]];
 170             s += sq[pix[1]];
 171             s += sq[pix[2]];
 172             s += sq[pix[3]];
 173             s += sq[pix[4]];
 174             s += sq[pix[5]];
 175             s += sq[pix[6]];
 176             s += sq[pix[7]];
 177 #else
 178 #if LONG_MAX > 2147483647
 179             register uint64_t x=*(uint64_t*)pix;
 180             s += sq[x&0xff];
 181             s += sq[(x>>8)&0xff];
 182             s += sq[(x>>16)&0xff];
 183             s += sq[(x>>24)&0xff];
 184             s += sq[(x>>32)&0xff];
 185             s += sq[(x>>40)&0xff];
 186             s += sq[(x>>48)&0xff];
 187             s += sq[(x>>56)&0xff];
 188 #else
 189             register uint32_t x=*(uint32_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             x=*(uint32_t*)(pix+4);
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199 #endif
 200 #endif
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 209     int i;
 210
 211     for(i=0; i+8<=w; i+=8){
 212         dst[i+0]= av_bswap32(src[i+0]);
 213         dst[i+1]= av_bswap32(src[i+1]);
 214         dst[i+2]= av_bswap32(src[i+2]);
 215         dst[i+3]= av_bswap32(src[i+3]);
 216         dst[i+4]= av_bswap32(src[i+4]);
 217         dst[i+5]= av_bswap32(src[i+5]);
 218         dst[i+6]= av_bswap32(src[i+6]);
 219         dst[i+7]= av_bswap32(src[i+7]);
 220     }
 221     for(;i<w; i++){
 222         dst[i+0]= av_bswap32(src[i+0]);
 223     }
 224 }
 225
 226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 227 {
 228     int s, i;
 229     uint32_t *sq = ff_squareTbl + 256;
 230
 231     s = 0;
 232     for (i = 0; i < h; i++) {
 233         s += sq[pix1[0] - pix2[0]];
 234         s += sq[pix1[1] - pix2[1]];
 235         s += sq[pix1[2] - pix2[2]];
 236         s += sq[pix1[3] - pix2[3]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = ff_squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[0] - pix2[0]];
 251         s += sq[pix1[1] - pix2[1]];
 252         s += sq[pix1[2] - pix2[2]];
 253         s += sq[pix1[3] - pix2[3]];
 254         s += sq[pix1[4] - pix2[4]];
 255         s += sq[pix1[5] - pix2[5]];
 256         s += sq[pix1[6] - pix2[6]];
 257         s += sq[pix1[7] - pix2[7]];
 258         pix1 += line_size;
 259         pix2 += line_size;
 260     }
 261     return s;
 262 }
 263
 264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 265 {
 266     int s, i;
 267     uint32_t *sq = ff_squareTbl + 256;
 268
 269     s = 0;
 270     for (i = 0; i < h; i++) {
 271         s += sq[pix1[ 0] - pix2[ 0]];
 272         s += sq[pix1[ 1] - pix2[ 1]];
 273         s += sq[pix1[ 2] - pix2[ 2]];
 274         s += sq[pix1[ 3] - pix2[ 3]];
 275         s += sq[pix1[ 4] - pix2[ 4]];
 276         s += sq[pix1[ 5] - pix2[ 5]];
 277         s += sq[pix1[ 6] - pix2[ 6]];
 278         s += sq[pix1[ 7] - pix2[ 7]];
 279         s += sq[pix1[ 8] - pix2[ 8]];
 280         s += sq[pix1[ 9] - pix2[ 9]];
 281         s += sq[pix1[10] - pix2[10]];
 282         s += sq[pix1[11] - pix2[11]];
 283         s += sq[pix1[12] - pix2[12]];
 284         s += sq[pix1[13] - pix2[13]];
 285         s += sq[pix1[14] - pix2[14]];
 286         s += sq[pix1[15] - pix2[15]];
 287
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294 /* draw the edges of width 'w' of an image of size width, height */
 295 //FIXME check that this is ok for mpeg4 interlaced
 296 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 297 {
 298     uint8_t *ptr, *last_line;
 299     int i;
 300
 301     last_line = buf + (height - 1) * wrap;
 302     for(i=0;i<w;i++) {
 303         /* top and bottom */
 304         memcpy(buf - (i + 1) * wrap, buf, width);
 305         memcpy(last_line + (i + 1) * wrap, last_line, width);
 306     }
 307     /* left and right */
 308     ptr = buf;
 309     for(i=0;i<height;i++) {
 310         memset(ptr - w, ptr[0], w);
 311         memset(ptr + width, ptr[width-1], w);
 312         ptr += wrap;
 313     }
 314     /* corners */
 315     for(i=0;i<w;i++) {
 316         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 317         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 318         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 319         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 320     }
 321 }
 322
 323 /**
 324  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 325  * @param buf destination buffer
 326  * @param src source buffer
 327  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 328  * @param block_w width of block
 329  * @param block_h height of block
 330  * @param src_x x coordinate of the top left sample of the block in the source buffer
 331  * @param src_y y coordinate of the top left sample of the block in the source buffer
 332  * @param w width of the source buffer
 333  * @param h height of the source buffer
 334  */
 335 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 336                                     int src_x, int src_y, int w, int h){
 337     int x, y;
 338     int start_y, start_x, end_y, end_x;
 339
 340     if(src_y>= h){
 341         src+= (h-1-src_y)*linesize;
 342         src_y=h-1;
 343     }else if(src_y<=-block_h){
 344         src+= (1-block_h-src_y)*linesize;
 345         src_y=1-block_h;
 346     }
 347     if(src_x>= w){
 348         src+= (w-1-src_x);
 349         src_x=w-1;
 350     }else if(src_x<=-block_w){
 351         src+= (1-block_w-src_x);
 352         src_x=1-block_w;
 353     }
 354
 355     start_y= FFMAX(0, -src_y);
 356     start_x= FFMAX(0, -src_x);
 357     end_y= FFMIN(block_h, h-src_y);
 358     end_x= FFMIN(block_w, w-src_x);
 359
 360     // copy existing part
 361     for(y=start_y; y<end_y; y++){
 362         for(x=start_x; x<end_x; x++){
 363             buf[x + y*linesize]= src[x + y*linesize];
 364         }
 365     }
 366
 367     //top
 368     for(y=0; y<start_y; y++){
 369         for(x=start_x; x<end_x; x++){
 370             buf[x + y*linesize]= buf[x + start_y*linesize];
 371         }
 372     }
 373
 374     //bottom
 375     for(y=end_y; y<block_h; y++){
 376         for(x=start_x; x<end_x; x++){
 377             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 378         }
 379     }
 380
 381     for(y=0; y<block_h; y++){
 382        //left
 383         for(x=0; x<start_x; x++){
 384             buf[x + y*linesize]= buf[start_x + y*linesize];
 385         }
 386
 387        //right
 388         for(x=end_x; x<block_w; x++){
 389             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 390         }
 391     }
 392 }
 393
 394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 395 {
 396     int i;
 397
 398     /* read the pixels */
 399     for(i=0;i<8;i++) {
 400         block[0] = pixels[0];
 401         block[1] = pixels[1];
 402         block[2] = pixels[2];
 403         block[3] = pixels[3];
 404         block[4] = pixels[4];
 405         block[5] = pixels[5];
 406         block[6] = pixels[6];
 407         block[7] = pixels[7];
 408         pixels += line_size;
 409         block += 8;
 410     }
 411 }
 412
 413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 414                           const uint8_t *s2, int stride){
 415     int i;
 416
 417     /* read the pixels */
 418     for(i=0;i<8;i++) {
 419         block[0] = s1[0] - s2[0];
 420         block[1] = s1[1] - s2[1];
 421         block[2] = s1[2] - s2[2];
 422         block[3] = s1[3] - s2[3];
 423         block[4] = s1[4] - s2[4];
 424         block[5] = s1[5] - s2[5];
 425         block[6] = s1[6] - s2[6];
 426         block[7] = s1[7] - s2[7];
 427         s1 += stride;
 428         s2 += stride;
 429         block += 8;
 430     }
 431 }
 432
 433
 434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 435                                  int line_size)
 436 {
 437     int i;
 438     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 439
 440     /* read the pixels */
 441     for(i=0;i<8;i++) {
 442         pixels[0] = cm[block[0]];
 443         pixels[1] = cm[block[1]];
 444         pixels[2] = cm[block[2]];
 445         pixels[3] = cm[block[3]];
 446         pixels[4] = cm[block[4]];
 447         pixels[5] = cm[block[5]];
 448         pixels[6] = cm[block[6]];
 449         pixels[7] = cm[block[7]];
 450
 451         pixels += line_size;
 452         block += 8;
 453     }
 454 }
 455
 456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 457                                  int line_size)
 458 {
 459     int i;
 460     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 461
 462     /* read the pixels */
 463     for(i=0;i<4;i++) {
 464         pixels[0] = cm[block[0]];
 465         pixels[1] = cm[block[1]];
 466         pixels[2] = cm[block[2]];
 467         pixels[3] = cm[block[3]];
 468
 469         pixels += line_size;
 470         block += 8;
 471     }
 472 }
 473
 474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 475                                  int line_size)
 476 {
 477     int i;
 478     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 479
 480     /* read the pixels */
 481     for(i=0;i<2;i++) {
 482         pixels[0] = cm[block[0]];
 483         pixels[1] = cm[block[1]];
 484
 485         pixels += line_size;
 486         block += 8;
 487     }
 488 }
 489
 490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 491                                         uint8_t *restrict pixels,
 492                                         int line_size)
 493 {
 494     int i, j;
 495
 496     for (i = 0; i < 8; i++) {
 497         for (j = 0; j < 8; j++) {
 498             if (*block < -128)
 499                 *pixels = 0;
 500             else if (*block > 127)
 501                 *pixels = 255;
 502             else
 503                 *pixels = (uint8_t)(*block + 128);
 504             block++;
 505             pixels++;
 506         }
 507         pixels += (line_size - 8);
 508     }
 509 }
 510
 511 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 512                                     int line_size)
 513 {
 514     int i;
 515
 516     /* read the pixels */
 517     for(i=0;i<8;i++) {
 518         pixels[0] = block[0];
 519         pixels[1] = block[1];
 520         pixels[2] = block[2];
 521         pixels[3] = block[3];
 522         pixels[4] = block[4];
 523         pixels[5] = block[5];
 524         pixels[6] = block[6];
 525         pixels[7] = block[7];
 526
 527         pixels += line_size;
 528         block += 8;
 529     }
 530 }
 531
 532 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 533                           int line_size)
 534 {
 535     int i;
 536     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 537
 538     /* read the pixels */
 539     for(i=0;i<8;i++) {
 540         pixels[0] = cm[pixels[0] + block[0]];
 541         pixels[1] = cm[pixels[1] + block[1]];
 542         pixels[2] = cm[pixels[2] + block[2]];
 543         pixels[3] = cm[pixels[3] + block[3]];
 544         pixels[4] = cm[pixels[4] + block[4]];
 545         pixels[5] = cm[pixels[5] + block[5]];
 546         pixels[6] = cm[pixels[6] + block[6]];
 547         pixels[7] = cm[pixels[7] + block[7]];
 548         pixels += line_size;
 549         block += 8;
 550     }
 551 }
 552
 553 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 554                           int line_size)
 555 {
 556     int i;
 557     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 558
 559     /* read the pixels */
 560     for(i=0;i<4;i++) {
 561         pixels[0] = cm[pixels[0] + block[0]];
 562         pixels[1] = cm[pixels[1] + block[1]];
 563         pixels[2] = cm[pixels[2] + block[2]];
 564         pixels[3] = cm[pixels[3] + block[3]];
 565         pixels += line_size;
 566         block += 8;
 567     }
 568 }
 569
 570 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 571                           int line_size)
 572 {
 573     int i;
 574     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 575
 576     /* read the pixels */
 577     for(i=0;i<2;i++) {
 578         pixels[0] = cm[pixels[0] + block[0]];
 579         pixels[1] = cm[pixels[1] + block[1]];
 580         pixels += line_size;
 581         block += 8;
 582     }
 583 }
 584
 585 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 586 {
 587     int i;
 588     for(i=0;i<8;i++) {
 589         pixels[0] += block[0];
 590         pixels[1] += block[1];
 591         pixels[2] += block[2];
 592         pixels[3] += block[3];
 593         pixels[4] += block[4];
 594         pixels[5] += block[5];
 595         pixels[6] += block[6];
 596         pixels[7] += block[7];
 597         pixels += line_size;
 598         block += 8;
 599     }
 600 }
 601
 602 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 603 {
 604     int i;
 605     for(i=0;i<4;i++) {
 606         pixels[0] += block[0];
 607         pixels[1] += block[1];
 608         pixels[2] += block[2];
 609         pixels[3] += block[3];
 610         pixels += line_size;
 611         block += 4;
 612     }
 613 }
 614
 615 static int sum_abs_dctelem_c(DCTELEM *block)
 616 {
 617     int sum=0, i;
 618     for(i=0; i<64; i++)
 619         sum+= FFABS(block[i]);
 620     return sum;
 621 }
 622
 623 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 624 {
 625     int i;
 626
 627     for (i = 0; i < h; i++) {
 628         memset(block, value, 16);
 629         block += line_size;
 630     }
 631 }
 632
 633 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 634 {
 635     int i;
 636
 637     for (i = 0; i < h; i++) {
 638         memset(block, value, 8);
 639         block += line_size;
 640     }
 641 }
 642
 643 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 644 {
 645     int i, j;
 646     uint16_t *dst1 = (uint16_t *) dst;
 647     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 648
 649     for (j = 0; j < 8; j++) {
 650         for (i = 0; i < 8; i++) {
 651             dst1[i] = dst2[i] = src[i] * 0x0101;
 652         }
 653         src  += 8;
 654         dst1 += linesize;
 655         dst2 += linesize;
 656     }
 657 }
 658
 659 #if 0
 660
/*
 * Disabled (#if 0) variant of PIXOP2 that processes 8 samples at a time
 * through 64-bit words.
 *
 * For a given OPNAME / OP pair it generates plain, x2, y2 and xy2 functions
 * (with and without rounding) that combine unaligned 64-bit reads of the
 * source into *block via OP, and then wraps them with CALL_2X_PIXELS to
 * produce the 16-wide versions.
 *
 * NOTE(review): the first generated function is named OPNAME ## _pixels,
 * while CALL_2X_PIXELS below refers to OPNAME ## _pixels_c; this would have
 * to be reconciled before the branch could be enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Replace a with the byte-wise average of a and b, rounded up, for eight
 * samples packed into 64 bits: (a|b) - (((a^b) & 0xFE..FE) >> 1). */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 801 #else // 64 bit variant
 802
 803 #define PIXOP2(OPNAME, OP) \
 804 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 805     int i;\
 806     for(i=0; i<h; i++){\
 807         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 808         pixels+=line_size;\
 809         block +=line_size;\
 810     }\
 811 }\
 812 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 813     int i;\
 814     for(i=0; i<h; i++){\
 815         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 816         pixels+=line_size;\
 817         block +=line_size;\
 818     }\
 819 }\
 820 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 821     int i;\
 822     for(i=0; i<h; i++){\
 823         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 824         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 825         pixels+=line_size;\
 826         block +=line_size;\
 827     }\
 828 }\
 829 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 834                                                 int src_stride1, int src_stride2, int h){\
 835     int i;\
 836     for(i=0; i<h; i++){\
 837         uint32_t a,b;\
 838         a= AV_RN32(&src1[i*src_stride1  ]);\
 839         b= AV_RN32(&src2[i*src_stride2  ]);\
 840         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 841         a= AV_RN32(&src1[i*src_stride1+4]);\
 842         b= AV_RN32(&src2[i*src_stride2+4]);\
 843         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 844     }\
 845 }\
 846 \
 847 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 848                                                 int src_stride1, int src_stride2, int h){\
 849     int i;\
 850     for(i=0; i<h; i++){\
 851         uint32_t a,b;\
 852         a= AV_RN32(&src1[i*src_stride1  ]);\
 853         b= AV_RN32(&src2[i*src_stride2  ]);\
 854         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 855         a= AV_RN32(&src1[i*src_stride1+4]);\
 856         b= AV_RN32(&src2[i*src_stride2+4]);\
 857         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 858     }\
 859 }\
 860 \
 861 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 862                                                 int src_stride1, int src_stride2, int h){\
 863     int i;\
 864     for(i=0; i<h; i++){\
 865         uint32_t a,b;\
 866         a= AV_RN32(&src1[i*src_stride1  ]);\
 867         b= AV_RN32(&src2[i*src_stride2  ]);\
 868         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 869     }\
 870 }\
 871 \
 872 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 873                                                 int src_stride1, int src_stride2, int h){\
 874     int i;\
 875     for(i=0; i<h; i++){\
 876         uint32_t a,b;\
 877         a= AV_RN16(&src1[i*src_stride1  ]);\
 878         b= AV_RN16(&src2[i*src_stride2  ]);\
 879         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 880     }\
 881 }\
 882 \
 883 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 884                                                 int src_stride1, int src_stride2, int h){\
 885     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 886     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 887 }\
 888 \
 889 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 890                                                 int src_stride1, int src_stride2, int h){\
 891     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 892     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 893 }\
 894 \
 895 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 896     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 897 }\
 898 \
 899 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 900     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 901 }\
 902 \
 903 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 904     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 905 }\
 906 \
 907 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 908     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 909 }\
 910 \
 911 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 912                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 913     int i;\
 914     for(i=0; i<h; i++){\
 915         uint32_t a, b, c, d, l0, l1, h0, h1;\
 916         a= AV_RN32(&src1[i*src_stride1]);\
 917         b= AV_RN32(&src2[i*src_stride2]);\
 918         c= AV_RN32(&src3[i*src_stride3]);\
 919         d= AV_RN32(&src4[i*src_stride4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x02020202UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930         a= AV_RN32(&src1[i*src_stride1+4]);\
 931         b= AV_RN32(&src2[i*src_stride2+4]);\
 932         c= AV_RN32(&src3[i*src_stride3+4]);\
 933         d= AV_RN32(&src4[i*src_stride4+4]);\
 934         l0=  (a&0x03030303UL)\
 935            + (b&0x03030303UL)\
 936            + 0x02020202UL;\
 937         h0= ((a&0xFCFCFCFCUL)>>2)\
 938           + ((b&0xFCFCFCFCUL)>>2);\
 939         l1=  (c&0x03030303UL)\
 940            + (d&0x03030303UL);\
 941         h1= ((c&0xFCFCFCFCUL)>>2)\
 942           + ((d&0xFCFCFCFCUL)>>2);\
 943         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 944     }\
 945 }\
 946 \
 947 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 948     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 949 }\
 950 \
 951 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 952     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 953 }\
 954 \
 955 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 956     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 957 }\
 958 \
 959 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 960     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 961 }\
 962 \
 963 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 964                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 965     int i;\
 966     for(i=0; i<h; i++){\
 967         uint32_t a, b, c, d, l0, l1, h0, h1;\
 968         a= AV_RN32(&src1[i*src_stride1]);\
 969         b= AV_RN32(&src2[i*src_stride2]);\
 970         c= AV_RN32(&src3[i*src_stride3]);\
 971         d= AV_RN32(&src4[i*src_stride4]);\
 972         l0=  (a&0x03030303UL)\
 973            + (b&0x03030303UL)\
 974            + 0x01010101UL;\
 975         h0= ((a&0xFCFCFCFCUL)>>2)\
 976           + ((b&0xFCFCFCFCUL)>>2);\
 977         l1=  (c&0x03030303UL)\
 978            + (d&0x03030303UL);\
 979         h1= ((c&0xFCFCFCFCUL)>>2)\
 980           + ((d&0xFCFCFCFCUL)>>2);\
 981         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 982         a= AV_RN32(&src1[i*src_stride1+4]);\
 983         b= AV_RN32(&src2[i*src_stride2+4]);\
 984         c= AV_RN32(&src3[i*src_stride3+4]);\
 985         d= AV_RN32(&src4[i*src_stride4+4]);\
 986         l0=  (a&0x03030303UL)\
 987            + (b&0x03030303UL)\
 988            + 0x01010101UL;\
 989         h0= ((a&0xFCFCFCFCUL)>>2)\
 990           + ((b&0xFCFCFCFCUL)>>2);\
 991         l1=  (c&0x03030303UL)\
 992            + (d&0x03030303UL);\
 993         h1= ((c&0xFCFCFCFCUL)>>2)\
 994           + ((d&0xFCFCFCFCUL)>>2);\
 995         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 996     }\
 997 }\
 998 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 999                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1000     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1002 }\
1003 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1004                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1005     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007 }\
1008 \
/* 2-pixel-wide 2x2 box filter: each output byte is (sum of its 2x2 source neighbourhood + 2) >> 2. */\
/* a0/b0 hold the horizontal pair sums of one row with the +2 rounding term folded in, a1/b1 those of */\
/* the row in between; the sums are reused from one output row to the next. */\
/* The loop emits two rows per pass, so an odd h still writes h+1 rows. */\
/* NOTE: results are stored by plain assignment instead of OP (see the FIXME below), so every */\
/* OPNAME instantiation of this function overwrites block rather than combining with it. */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        /* pair sums of the first source row: a0 = p[0]+p[1]+2, b0 = p[1]+p[2]+2 */\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            /* pair sums of the next row, without rounding term */\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            /* refresh a0/b0 from the following row; a1/b1 now act as the upper row */\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
1041 \
/* 4-pixel-wide 2x2 box filter: each output byte is (sum of its 2x2 source neighbourhood + 2) >> 2, handed to OP. */\
/* Four pixels are handled per 32-bit word: l0/l1 hold per-byte sums of the low 2 bits of a horizontal */\
/* pixel pair, h0/h1 the sums of the high 6 bits (pre-shifted by 2), so no carry crosses a byte boundary. */\
/* Row sums are reused between consecutive output rows; the loop emits two rows per pass, */\
/* so an odd h still writes h+1 rows. */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        /* first source row: words starting at x and x+1; the +2 rounding term lives in l0 */\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            /* these a/b shadow the outer constants of the same name */\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            /* second output row of the pass: recompute l0/h0 from the next source row, reuse l1/h1 */\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
1077 \
/* 8-pixel-wide 2x2 box filter: each output byte is (sum of its 2x2 source neighbourhood + 2) >> 2, handed to OP. */\
/* The block is processed as two 4-byte-wide columns (outer j loop); inside a column the low-2-bit sums (l0/l1) */\
/* and high-6-bit sums (h0/h1) of horizontal pixel pairs are reused between consecutive output rows. */\
/* Two rows are emitted per inner pass; the pointer rewind after the inner loop is exact only for even h. */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        /* first source row of this column; the +2 rounding term lives in l0 */\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            /* these a/b shadow the outer constants of the same name */\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            /* second output row of the pass: recompute l0/h0 from the next source row, reuse l1/h1 */\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row and 4 bytes to the right for the second column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
1118 \
/* 8-pixel-wide 2x2 box filter with the smaller rounding term: each output byte is */\
/* (sum of its 2x2 source neighbourhood + 1) >> 2, handed to OP. */\
/* Structure matches _pixels8_xy2_c: two 4-byte-wide columns (outer j loop), low-2-bit sums in l0/l1, */\
/* high-6-bit sums in h0/h1, two output rows per inner pass; the pointer rewind is exact only for even h. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        /* first source row of this column; the +1 rounding term lives in l0 */\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            /* these a/b shadow the outer constants of the same name */\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            /* second output row of the pass: recompute l0/h0 from the next source row, reuse l1/h1 */\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row and 4 bytes to the right for the second column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
1159 \
/* 16-wide entry points derived from the 8-wide ones through CALL_2X_PIXELS (defined outside this macro; */\
/* presumably it calls the second function twice, 8 bytes apart - confirm against its definition). */\
/* Note that _no_rnd_pixels16_c is built from the plain _pixels8_c: no separate no_rnd 8-wide copy is referenced. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1168
/* Store operator for the avg_* family: combine the new value b with the current contents of a via rnd_avg32(). */
/* NOTE(review): this definition sits inside a preprocessor conditional opened above; op_put below does not. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* Store operator for the put_* family: overwrite a with b. */
#define op_put(a, b) a = b

/* Instantiate the PIXOP2 function set once per store operator (avg_* and put_* names). */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* Both operator names are defined again with different bodies for H264_CHROMA_MC, so release them here. */
#undef op_avg
#undef op_put
1177
/* Rounded-up mean of two values: (a + b + 1) / 2.
 * Arguments are parenthesized so that expressions with lower-precedence
 * operators (|, &, ?:, ...) are averaged as a whole. Each argument is
 * evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded mean of four values: (a + b + c + d + 2) / 4, with the same
 * argument parenthesization as avg2. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1180
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources a and b are walked with the same stride for h rows.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                     int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1184
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources a and b are walked with the same stride for h rows.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                    int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1188
/**
 * Bilinear interpolation of an 8-pixel-wide block at a 1/16-pel offset.
 *
 * Every output pixel is the weighted sum of a 2x2 source neighbourhood; the
 * four weights are products of x16 / (16 - x16) and y16 / (16 - y16) and
 * therefore add up to 256, which the final >> 8 removes again.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; 9 bytes are read from each of h + 1 rows
 * @param stride  distance in bytes between rows, used for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand column
 * @param y16     vertical weight of the lower row
 * @param rounder value added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left     = (16 - x16) * (16 - y16);
    const int w_top_right    = x16 * (16 - y16);
    const int w_bottom_left  = (16 - x16) * y16;
    const int w_bottom_right = x16 * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left     * src[col]
                      + w_top_right    * src[col + 1]
                      + w_bottom_left  * src[stride + col]
                      + w_bottom_right * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1211
/**
 * Warped bilinear fetch of an 8-pixel-wide, h-row block.
 *
 * The source position of each output pixel is tracked in fixed point: the
 * top 16 bits downwards are dropped with >> 16, the next `shift` bits are the
 * sub-pixel fraction and the remainder is the integer sample position.
 * (ox, oy) is the position of the first pixel; it advances by (dxx, dyx) per
 * output column and by (dxy, dyy) per output row.
 *
 * Positions inside [0, width-1) x [0, height-1) are interpolated from a 2x2
 * neighbourhood. When one coordinate is outside, it is clipped with av_clip()
 * and interpolation only happens along the other axis; when both are outside,
 * the clipped sample is copied unchanged.
 *
 * @param dst    destination, written at dst[y*stride + x]
 * @param src    source plane
 * @param stride row distance in bytes, shared by dst and src
 * @param h      number of output rows
 * @param shift  number of fractional bits (s = 1 << shift)
 * @param r      rounding term added before the final >> (2*shift)
 * @param width  source width used for the bounds test
 * @param height source height used for the bounds test
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width - 1;
    const int max_y     = height - 1;
    int y;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        int x, vx, vy;

        for (x = 0, vx = ox, vy = oy; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            uint8_t *out = &dst[y*stride + x];
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside = (unsigned)src_x < max_x;
            const int y_inside = (unsigned)src_y < max_y;
            int index;

            if (x_inside && y_inside) {
                index = src_x + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_x)
                           + src[index       +1]*   frac_x )*(s-frac_y)
                        + (  src[index+stride  ]*(s-frac_x)
                           + src[index+stride+1]*   frac_x )*   frac_y
                        + r) >> (shift*2);
            } else if (x_inside) {
                /* row clipped: interpolate horizontally only */
                index = src_x + av_clip(src_y, 0, max_y)*stride;
                *out = ( (  src[index  ]*(s-frac_x)
                          + src[index+1]*   frac_x )*s
                        + r) >> (shift*2);
            } else if (y_inside) {
                /* column clipped: interpolate vertically only */
                index = av_clip(src_x, 0, max_x) + src_y*stride;
                *out = ( (  src[index       ]*(s-frac_y)
                          + src[index+stride]*   frac_y )*s
                        + r) >> (shift*2);
            } else {
                index = av_clip(src_x, 0, max_x) + av_clip(src_y, 0, max_y)*stride;
                *out = src[index];
            }
        }
    }
}
1269
/**
 * Zero-offset case: forwards to the put_pixels{2,4,8,16}_c routine that
 * matches width. Any other width is silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1278
/**
 * Horizontal 2:1 blend: dst = (2*cur + right + 1) / 3 for a width x height
 * block. The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1289
/**
 * Horizontal 1:2 blend: dst = (cur + 2*right + 1) / 3 for a width x height
 * block. The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1300
/**
 * Vertical 2:1 blend: dst = (2*cur + below + 1) / 3 for a width x height
 * block. The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1311
/**
 * 2x2 blend with weights 4:3:3:2 (cur, right, below, below-right), i.e.
 * dst = (weighted sum + 6) / 12. The division by 12 is done as a multiply
 * by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1322
/**
 * 2x2 blend with weights 3:2:4:3 (cur, right, below, below-right), i.e.
 * dst = (weighted sum + 6) / 12. The division by 12 is done as a multiply
 * by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1333
/**
 * Vertical 1:2 blend: dst = (cur + 2*below + 1) / 3 for a width x height
 * block. The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1344
/**
 * 2x2 blend with weights 3:4:2:3 (cur, right, below, below-right), i.e.
 * dst = (weighted sum + 6) / 12. The division by 12 is done as a multiply
 * by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1355
/**
 * 2x2 blend with weights 2:3:3:4 (cur, right, below, below-right), i.e.
 * dst = (weighted sum + 6) / 12. The division by 12 is done as a multiply
 * by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1366
/**
 * Zero-offset case: forwards to the avg_pixels{2,4,8,16}_c routine that
 * matches width. Any other width is silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1375
/**
 * Horizontal 2:1 blend, (2*cur + right + 1) / 3 via multiply by 683 and
 * shift by 11, then averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1386
/**
 * Horizontal 1:2 blend, (cur + 2*right + 1) / 3 via multiply by 683 and
 * shift by 11, then averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1397
/**
 * Vertical 2:1 blend, (2*cur + below + 1) / 3 via multiply by 683 and
 * shift by 11, then averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1408
/**
 * 2x2 blend with weights 4:3:3:2 (cur, right, below, below-right),
 * (weighted sum + 6) / 12 via multiply by 2731 and shift by 15, then
 * averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1419
/**
 * 2x2 blend with weights 3:2:4:3 (cur, right, below, below-right),
 * (weighted sum + 6) / 12 via multiply by 2731 and shift by 15, then
 * averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1430
/**
 * Vertical 1:2 blend, (cur + 2*below + 1) / 3 via multiply by 683 and
 * shift by 11, then averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1441
/**
 * 2x2 blend with weights 3:4:2:3 (cur, right, below, below-right),
 * (weighted sum + 6) / 12 via multiply by 2731 and shift by 15, then
 * averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1452
/**
 * 2x2 blend with weights 2:3:3:4 (cur, right, below, below-right),
 * (weighted sum + 6) / 12 via multiply by 2731 and shift by 15, then
 * averaged (rounding up) with the value already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width put_tpel wrappers: TPEL_WIDTH(w) defines
 * put_tpel_pixels<w>_mcXY_c(dst, src, stride, height) for each of the nine
 * sub-positions, forwarding to the generic put_tpel_pixels_mcXY_c() with
 * width = w. The wrapper bodies previously read "void f(args);", which is an
 * invalid declaration rather than a call; the stray "void" has been dropped so
 * the block compiles if it is ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1484
/* Generates OPNAME##h264_chroma_mc{2,4,8}_c: bilinear interpolation of a 2-, 4- or 8-pixel-wide block */
/* at a 1/8-pel offset (x, y), both required to lie in [0, 8). The four weights add up to 64; the raw */
/* weighted sum is handed to OP, which is responsible for scaling it back and storing it into dst. */
/* When x*y == 0 at most one neighbour contributes, so a two-tap path is used: "step" selects the pixel */
/* below (y != 0) or the pixel to the right (otherwise), weighted by the combined off-axis weight. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w_br){\
        for(row=0; row<h; row++){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w_other= w_tr+w_bl;\
        const int step= w_bl ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w_tl*src[col] + w_other*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w_br){\
        for(row=0; row<h; row++){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w_other= w_tr+w_bl;\
        const int step= w_bl ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w_tl*src[col] + w_other*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w_br){\
        for(row=0; row<h; row++){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w_other= w_tr+w_bl;\
        const int step= w_bl ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w_tl*src[col] + w_other*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1585
/* Store operators for H264_CHROMA_MC. b is the raw weighted sum, whose weights total 64: */
/* op_put scales it back with rounding, (b + 32) >> 6, and overwrites a; */
/* op_avg additionally takes the rounded-up mean of that value and the current contents of a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate put_h264_chroma_mc{2,4,8}_c and avg_h264_chroma_mc{2,4,8}_c. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1593
/**
 * 8-pixel-wide bilinear chroma interpolation at a 1/8-pel offset, using the
 * reduced rounding term 32 - 4 = 28 before the final >> 6.
 *
 * @param dst    destination, 8 bytes written per row
 * @param src    source; 9 bytes are read from each of h + 1 rows
 * @param stride row distance in bytes, shared by dst and src
 * @param h      number of rows
 * @param x      horizontal offset, must be in [0, 8)
 * @param y      vertical offset, must be in [0, 8)
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8 - x) * (8 - y);
    const int w_tr = x * (8 - y);
    const int w_bl = (8 - x) * y;
    const int w_br = x * y;
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1] + bias) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1617
/**
 * 8-pixel-wide bilinear chroma interpolation at a 1/8-pel offset, using the
 * reduced rounding term 32 - 4 = 28 before the >> 6; the interpolated value
 * is then merged with the pixel already in dst through avg2().
 *
 * @param dst    destination, 8 bytes read and rewritten per row
 * @param src    source; 9 bytes are read from each of h + 1 rows
 * @param stride row distance in bytes, shared by dst and src
 * @param h      number of rows
 * @param x      horizontal offset, must be in [0, 8)
 * @param y      vertical offset, must be in [0, 8)
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8 - x) * (8 - y);
    const int w_tr = x * (8 - y);
    const int w_bl = (8 - x) * y;
    const int w_br = x * y;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = avg2(dst[col], ((w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1] + 32 - 4) >> 6));
        }
        dst += stride;
        src += stride;
    }
}
1641
/**
 * Generate the C MPEG-4 quarter-pel motion-compensation functions for one
 * output mode.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name and onto the
 *               pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers that write the
 *               final result
 * @param RND    token pasted between "put" and the helper name for every
 *               intermediate (scratch-buffer) pass
 * @param OP     two-argument macro OP(pixel, sum) that stores one filtered
 *               sample; the lowpass kernels keep a local named cm
 *               (ff_cropTbl + MAX_NEG_CROP) in scope for it
 *
 * Lowpass kernels (mpeg4_qpelN_{h,v}_lowpass, N = 8 or 16):
 *   8-tap filter with taps (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32.
 *   Each line of N outputs reads N+1 input samples (indices 0..N). Taps that
 *   would fall outside that range reuse in-range samples in mirrored order
 *   (-1 -> 0, -2 -> 1, -3 -> 2 on the low side; N+1 -> N, N+2 -> N-1,
 *   N+3 -> N-2 on the high side). The h kernels take a row count; the v
 *   kernels always process N columns and produce N rows.
 *
 * Position functions (qpelN_mcXY_c and the non-static ff_*_old_c variants):
 *   Each one builds its result from the source block and/or lowpass outputs
 *   held in stack scratch buffers. The XY suffix presumably encodes the
 *   quarter-pel offset (X horizontal, Y vertical) -- inferred from the
 *   naming, not shown here. pixelsN_l2 / pixelsN_l4, copy_block9 and
 *   copy_block17 are defined elsewhere; they are used here to combine two or
 *   four planes and to copy a 9- or 17-column block into a fixed-pitch
 *   buffer (pitch 16 or 24).
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal kernel, 8 outputs per row for h rows. */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int row;\
\
    for (row = 0; row < h; row++) {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
/* Vertical kernel, 8 columns; each column reads 9 rows and writes 8. */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col;\
\
    for (col = 0; col < 8; col++) {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* Horizontal kernel, 16 outputs per row for h rows. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int row;\
\
    for (row = 0; row < h; row++) {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
/* Vertical kernel, 16 columns; each column reads 17 rows and writes 16. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col;\
\
    for (col = 0; col < 16; col++) {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* ------------------------- 8x8 position functions ------------------------- */\
/* mc00: no filtering, straight OPNAME pixels8_c of the source. */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
/* mc10: h-filtered plane combined with src. */\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[8*8];\
\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
/* mc20: h-filtered plane written directly. */\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
/* mc30: h-filtered plane combined with src shifted one pixel right. */\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[8*8];\
\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
/* mc01: v-filtered plane combined with the copied source block. */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], half[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
/* mc02: v-filtered plane written directly. */\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
/* mc03: v-filtered plane combined with the source block one row down. */\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], half[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
/* mc11 (old form): four-plane combine of full, halfH, halfV and halfHV. */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfV[8*8], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* mc11: halfH is first merged with full in place, then v-filtered. */\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* mc31 (old form): as mc11_old, using the source shifted one pixel right. */\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfV[8*8], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* mc31: as mc11, merging halfH with full+1. */\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* mc13 (old form): as mc11_old, using full and halfH one row down. */\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfV[8*8], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* mc13: as mc11, final combine uses halfH one row down. */\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* mc33 (old form): source shifted right and down, halfH one row down. */\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfV[8*8], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* mc33: as mc31, final combine uses halfH one row down. */\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* mc21: h-filtered plane combined with its v-filtered version. */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[8*9], halfHV[8*8];\
\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* mc23: as mc21, using halfH one row down in the final combine. */\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[8*9], halfHV[8*8];\
\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
/* mc12 (old form): halfV combined with halfHV. */\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfV[8*8], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
/* mc12: halfH merged with full in place, then v-filtered straight to dst. */\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* mc32 (old form): as mc12_old with halfV taken from full+1. */\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9], halfV[8*8], halfHV[8*8];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
/* mc32: as mc12, merging halfH with full+1. */\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9], halfH[8*9];\
\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* mc22: h-filter into scratch (9 rows), then v-filter straight to dst. */\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[8*9];\
\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* ------------------------ 16x16 position functions ------------------------ */\
/* Same constructions as the 8x8 set with 16-wide kernels, a 24-pitch copy */\
/* of 17 rows, and 16x17 / 16x16 scratch planes. */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[16*16];\
\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[16*16];\
\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], half[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], half[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfV[16*16], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfV[16*16], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfV[16*16], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfV[16*16], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[16*17], halfHV[16*16];\
\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[16*17], halfHV[16*16];\
\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfV[16*16], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17], halfV[16*16], halfHV[16*16];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17], halfH[16*17];\
\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[16*17];\
\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2124
2125 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2126 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2127 #define op_put(a, b) a = cm[((b) + 16)>>5]
2128 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2129
2130 QPEL_MC(0, put_       , _       , op_put)
2131 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2132 QPEL_MC(0, avg_       , _       , op_avg)
2133 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2134 #undef op_avg
2135 #undef op_avg_no_rnd
2136 #undef op_put
2137 #undef op_put_no_rnd
2138
2139 #if 1
2140 #define H264_LOWPASS(OPNAME, OP, OP2) \
2141 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2142     const int h=2;\
2143     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2144     int i;\
2145     for(i=0; i<h; i++)\
2146     {\
2147         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2148         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2149         dst+=dstStride;\
2150         src+=srcStride;\
2151     }\
2152 }\
2153 \
2154 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2155     const int w=2;\
2156     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2157     int i;\
2158     for(i=0; i<w; i++)\
2159     {\
2160         const int srcB= src[-2*srcStride];\
2161         const int srcA= src[-1*srcStride];\
2162         const int src0= src[0 *srcStride];\
2163         const int src1= src[1 *srcStride];\
2164         const int src2= src[2 *srcStride];\
2165         const int src3= src[3 *srcStride];\
2166         const int src4= src[4 *srcStride];\
2167         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2168         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2169         dst++;\
2170         src++;\
2171     }\
2172 }\
2173 \
2174 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2175     const int h=2;\
2176     const int w=2;\
2177     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2178     int i;\
2179     src -= 2*srcStride;\
2180     for(i=0; i<h+5; i++)\
2181     {\
2182         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2183         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2184         tmp+=tmpStride;\
2185         src+=srcStride;\
2186     }\
2187     tmp -= tmpStride*(h+5-2);\
2188     for(i=0; i<w; i++)\
2189     {\
2190         const int tmpB= tmp[-2*tmpStride];\
2191         const int tmpA= tmp[-1*tmpStride];\
2192         const int tmp0= tmp[0 *tmpStride];\
2193         const int tmp1= tmp[1 *tmpStride];\
2194         const int tmp2= tmp[2 *tmpStride];\
2195         const int tmp3= tmp[3 *tmpStride];\
2196         const int tmp4= tmp[4 *tmpStride];\
2197         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2198         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2199         dst++;\
2200         tmp++;\
2201     }\
2202 }\
2203 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204     const int h=4;\
2205     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2206     int i;\
2207     for(i=0; i<h; i++)\
2208     {\
2209         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2210         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2211         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2212         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2213         dst+=dstStride;\
2214         src+=srcStride;\
2215     }\
2216 }\
2217 \
2218 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219     const int w=4;\
2220     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2221     int i;\
2222     for(i=0; i<w; i++)\
2223     {\
2224         const int srcB= src[-2*srcStride];\
2225         const int srcA= src[-1*srcStride];\
2226         const int src0= src[0 *srcStride];\
2227         const int src1= src[1 *srcStride];\
2228         const int src2= src[2 *srcStride];\
2229         const int src3= src[3 *srcStride];\
2230         const int src4= src[4 *srcStride];\
2231         const int src5= src[5 *srcStride];\
2232         const int src6= src[6 *srcStride];\
2233         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2234         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2235         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2236         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2237         dst++;\
2238         src++;\
2239     }\
2240 }\
2241 \
2242 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2243     const int h=4;\
2244     const int w=4;\
2245     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2246     int i;\
2247     src -= 2*srcStride;\
2248     for(i=0; i<h+5; i++)\
2249     {\
2250         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2251         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2252         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2253         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2254         tmp+=tmpStride;\
2255         src+=srcStride;\
2256     }\
2257     tmp -= tmpStride*(h+5-2);\
2258     for(i=0; i<w; i++)\
2259     {\
2260         const int tmpB= tmp[-2*tmpStride];\
2261         const int tmpA= tmp[-1*tmpStride];\
2262         const int tmp0= tmp[0 *tmpStride];\
2263         const int tmp1= tmp[1 *tmpStride];\
2264         const int tmp2= tmp[2 *tmpStride];\
2265         const int tmp3= tmp[3 *tmpStride];\
2266         const int tmp4= tmp[4 *tmpStride];\
2267         const int tmp5= tmp[5 *tmpStride];\
2268         const int tmp6= tmp[6 *tmpStride];\
2269         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2270         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2271         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2272         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2273         dst++;\
2274         tmp++;\
2275     }\
2276 }\
2277 \
2278 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2279     const int h=8;\
2280     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2281     int i;\
2282     for(i=0; i<h; i++)\
2283     {\
2284         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2285         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2286         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2287         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2288         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2289         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2290         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2291         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2292         dst+=dstStride;\
2293         src+=srcStride;\
2294     }\
2295 }\
2296 \
2297 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2298     const int w=8;\
2299     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2300     int i;\
2301     for(i=0; i<w; i++)\
2302     {\
2303         const int srcB= src[-2*srcStride];\
2304         const int srcA= src[-1*srcStride];\
2305         const int src0= src[0 *srcStride];\
2306         const int src1= src[1 *srcStride];\
2307         const int src2= src[2 *srcStride];\
2308         const int src3= src[3 *srcStride];\
2309         const int src4= src[4 *srcStride];\
2310         const int src5= src[5 *srcStride];\
2311         const int src6= src[6 *srcStride];\
2312         const int src7= src[7 *srcStride];\
2313         const int src8= src[8 *srcStride];\
2314         const int src9= src[9 *srcStride];\
2315         const int src10=src[10*srcStride];\
2316         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2317         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2318         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2319         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2320         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2321         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2322         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2323         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2324         dst++;\
2325         src++;\
2326     }\
2327 }\
2328 \
2329 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2330     const int h=8;\
2331     const int w=8;\
2332     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2333     int i;\
2334     src -= 2*srcStride;\
2335     for(i=0; i<h+5; i++)\
2336     {\
2337         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2338         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2339         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2340         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2341         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2342         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2343         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2344         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2345         tmp+=tmpStride;\
2346         src+=srcStride;\
2347     }\
2348     tmp -= tmpStride*(h+5-2);\
2349     for(i=0; i<w; i++)\
2350     {\
2351         const int tmpB= tmp[-2*tmpStride];\
2352         const int tmpA= tmp[-1*tmpStride];\
2353         const int tmp0= tmp[0 *tmpStride];\
2354         const int tmp1= tmp[1 *tmpStride];\
2355         const int tmp2= tmp[2 *tmpStride];\
2356         const int tmp3= tmp[3 *tmpStride];\
2357         const int tmp4= tmp[4 *tmpStride];\
2358         const int tmp5= tmp[5 *tmpStride];\
2359         const int tmp6= tmp[6 *tmpStride];\
2360         const int tmp7= tmp[7 *tmpStride];\
2361         const int tmp8= tmp[8 *tmpStride];\
2362         const int tmp9= tmp[9 *tmpStride];\
2363         const int tmp10=tmp[10*tmpStride];\
2364         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2365         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2366         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2367         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2368         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2369         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2370         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2371         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2372         dst++;\
2373         tmp++;\
2374     }\
2375 }\
2376 \
2377 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2378     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2379     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2380     src += 8*srcStride;\
2381     dst += 8*dstStride;\
2382     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2383     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2384 }\
2385 \
2386 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2387     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2388     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2389     src += 8*srcStride;\
2390     dst += 8*dstStride;\
2391     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2392     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2393 }\
2394 \
2395 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2396     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2397     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2398     src += 8*srcStride;\
2399     dst += 8*dstStride;\
2400     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2401     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2402 }\
2403
/*
 * H264_MC(OPNAME, SIZE) emits the sixteen SIZE x SIZE motion-compensation
 * entry points OPNAME##h264_qpel##SIZE##_mcXY_c.  Each one combines the
 * h / v / hv lowpass helpers and/or the unfiltered source, and stores the
 * result with the OPNAME flavour of the pixel helpers.  Intermediate
 * buffers are always produced with the put_ helpers.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: unfiltered block */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: l2 combination of src and the horizontal lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal lowpass only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: l2 combination of src+1 and the horizontal lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: l2 combination of the source rows and the vertical lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
    /* copy SIZE+5 rows starting two rows above src into a packed buffer */\
    copy_block ## SIZE (column, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical lowpass only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, center, stride, SIZE);\
}\
\
/* mc03: l2 combination of the rows one below and the vertical lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: l2 of horizontal lowpass (at src) and vertical lowpass (at src) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: l2 of horizontal lowpass (at src) and vertical lowpass (at src+1) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: l2 of horizontal lowpass (one row down) and vertical lowpass (at src) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: l2 of horizontal lowpass (one row down) and vertical lowpass (at src+1) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-dimensional lowpass only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: l2 of horizontal lowpass (at src) and two-dimensional lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: l2 of horizontal lowpass (one row down) and two-dimensional lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: l2 of vertical lowpass (at src) and two-dimensional lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: l2 of vertical lowpass (at src+1) and two-dimensional lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}
2540
2541 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2542 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2543 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2544 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2545 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2546
2547 H264_LOWPASS(put_       , op_put, op2_put)
2548 H264_LOWPASS(avg_       , op_avg, op2_avg)
2549 H264_MC(put_, 2)
2550 H264_MC(put_, 4)
2551 H264_MC(put_, 8)
2552 H264_MC(put_, 16)
2553 H264_MC(avg_, 4)
2554 H264_MC(avg_, 8)
2555 H264_MC(avg_, 16)
2556
2557 #undef op_avg
2558 #undef op_put
2559 #undef op2_avg
2560 #undef op2_put
2561 #endif
2562
2563 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2564     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2565     int i;
2566
2567     for(i=0; i<h; i++){
2568         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2569         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2570         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2571         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2572         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2573         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2574         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2575         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2576         dst+=dstStride;
2577         src+=srcStride;
2578     }
2579 }
2580
#if CONFIG_CAVS_DECODER
/* AVS specific: mc00 entry points simply forward to the plain 8x8 / 16x16
 * put and avg pixel helpers. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2596
#if CONFIG_VC1_DECODER
/* VC-1 specific: mc00 entry points forward to the plain 8x8 put / avg
 * pixel helpers; the rnd argument is not used here. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
2606
#if CONFIG_RV40_DECODER
/* RV40: the mc33 position is served by the xy2 pixel helpers
 * (put / avg, 16x16 and 8x8). */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
2621
2622 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2623     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2624     int i;
2625
2626     for(i=0; i<w; i++){
2627         const int src_1= src[ -srcStride];
2628         const int src0 = src[0          ];
2629         const int src1 = src[  srcStride];
2630         const int src2 = src[2*srcStride];
2631         const int src3 = src[3*srcStride];
2632         const int src4 = src[4*srcStride];
2633         const int src5 = src[5*srcStride];
2634         const int src6 = src[6*srcStride];
2635         const int src7 = src[7*srcStride];
2636         const int src8 = src[8*srcStride];
2637         const int src9 = src[9*srcStride];
2638         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2639         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2640         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2641         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2642         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2643         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2644         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2645         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2646         src++;
2647         dst++;
2648     }
2649 }
2650
/* mc00: plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2654
/* mc10: l2 combination of src and its horizontal lowpass. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2660
/* mc20: horizontal lowpass written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2664
/* mc30: l2 combination of src+1 and the horizontal lowpass of src. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2670
/* mc02: vertical lowpass written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2674
/* mc12: l2 combination of the vertical lowpass of src and the vertical
 * lowpass of the horizontally filtered rows. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical taps line up with src */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/* mc32: l2 combination of the vertical lowpass of src+1 and the vertical
 * lowpass of the horizontally filtered rows. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical taps line up with src */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/* mc22: horizontal lowpass into a scratch buffer, then vertical lowpass
 * of that buffer into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];   /* 11 horizontally filtered rows, starting one row above src */

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2698
2699 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2700     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2701     int x;
2702     const int strength= ff_h263_loop_filter_strength[qscale];
2703
2704     for(x=0; x<8; x++){
2705         int d1, d2, ad1;
2706         int p0= src[x-2*stride];
2707         int p1= src[x-1*stride];
2708         int p2= src[x+0*stride];
2709         int p3= src[x+1*stride];
2710         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2711
2712         if     (d<-2*strength) d1= 0;
2713         else if(d<-  strength) d1=-2*strength - d;
2714         else if(d<   strength) d1= d;
2715         else if(d< 2*strength) d1= 2*strength - d;
2716         else                   d1= 0;
2717
2718         p1 += d1;
2719         p2 -= d1;
2720         if(p1&256) p1= ~(p1>>31);
2721         if(p2&256) p2= ~(p2>>31);
2722
2723         src[x-1*stride] = p1;
2724         src[x+0*stride] = p2;
2725
2726         ad1= FFABS(d1)>>1;
2727
2728         d2= av_clip((p0-p3)/4, -ad1, ad1);
2729
2730         src[x-2*stride] = p0 - d2;
2731         src[x+  stride] = p3 + d2;
2732     }
2733     }
2734 }
2735
2736 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2737     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2738     int y;
2739     const int strength= ff_h263_loop_filter_strength[qscale];
2740
2741     for(y=0; y<8; y++){
2742         int d1, d2, ad1;
2743         int p0= src[y*stride-2];
2744         int p1= src[y*stride-1];
2745         int p2= src[y*stride+0];
2746         int p3= src[y*stride+1];
2747         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2748
2749         if     (d<-2*strength) d1= 0;
2750         else if(d<-  strength) d1=-2*strength - d;
2751         else if(d<   strength) d1= d;
2752         else if(d< 2*strength) d1= 2*strength - d;
2753         else                   d1= 0;
2754
2755         p1 += d1;
2756         p2 -= d1;
2757         if(p1&256) p1= ~(p1>>31);
2758         if(p2&256) p2= ~(p2>>31);
2759
2760         src[y*stride-1] = p1;
2761         src[y*stride+0] = p2;
2762
2763         ad1= FFABS(d1)>>1;
2764
2765         d2= av_clip((p0-p3)/4, -ad1, ad1);
2766
2767         src[y*stride-2] = p0 - d2;
2768         src[y*stride+1] = p3 + d2;
2769     }
2770     }
2771 }
2772
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 * The vertical pass leaves the first and last row unfiltered (scaled by 4);
 * the horizontal pass leaves the first and last column unfiltered, so those
 * samples are only rescaled with rounding.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int acc[64];
    int row, col;

    /* vertical pass into acc[]; every entry carries a gain of 4 */
    for(row=0; row<8; row++){
        const uint8_t *line = src + row*stride;
        int *out = acc + row*8;

        for(col=0; col<8; col++){
            if(row == 0 || row == 7)
                out[col] = 4*line[col];
            else
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass back into src */
    for(row=0; row<8; row++){
        uint8_t *line = src + row*stride;
        const int *in = acc + row*8;

        line[0] = (in[0] + 2)>>2;
        line[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2799
/**
 * Sum of absolute differences between two 16-pixel-wide blocks of h rows.
 * Both blocks use line_size as row stride; v is unused.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2827
/**
 * Sum of absolute differences between pix1 and avg2() of each pix2 sample
 * with its right-hand neighbour, over a 16-wide block of h rows.
 * Reads pix2[0..16] in every row; v is unused.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2855
/**
 * Sum of absolute differences between pix1 and avg2() of each pix2 sample
 * with the sample one row below it, over a 16-wide block of h rows.
 * Reads h+1 rows of pix2; v is unused.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2885
/**
 * Sum of absolute differences between pix1 and avg4() of each 2x2
 * neighbourhood of pix2 (sample, right, below, below-right), over a
 * 16-wide block of h rows.  Reads h+1 rows of 17 pix2 samples; v is unused.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2915
/**
 * Sum of absolute differences between two 8-pixel-wide blocks of h rows.
 * Both blocks use line_size as row stride; v is unused.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2935
/**
 * Sum of absolute differences between pix1 and avg2() of each pix2 sample
 * with its right-hand neighbour, over an 8-wide block of h rows.
 * Reads pix2[0..8] in every row; v is unused.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2955
/**
 * Sum of absolute differences between pix1 and avg2() of each pix2 sample
 * with the sample one row below it, over an 8-wide block of h rows.
 * Reads h+1 rows of pix2; v is unused.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2977
/**
 * Sum of absolute differences between pix1 and avg4() of each 2x2
 * neighbourhood of pix2 (sample, right, below, below-right), over an
 * 8-wide block of h rows.  Reads h+1 rows of 9 pix2 samples; v is unused.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2999
3000 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3001     MpegEncContext *c = v;
3002     int score1=0;
3003     int score2=0;
3004     int x,y;
3005
3006     for(y=0; y<h; y++){
3007         for(x=0; x<16; x++){
3008             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3009         }
3010         if(y+1<h){
3011             for(x=0; x<15; x++){
3012                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3013                              - s1[x+1] + s1[x+1+stride])
3014                         -FFABS(  s2[x  ] - s2[x  +stride]
3015                              - s2[x+1] + s2[x+1+stride]);
3016             }
3017         }
3018         s1+= stride;
3019         s2+= stride;
3020     }
3021
3022     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3023     else  return score1 + FFABS(score2)*8;
3024 }
3025
3026 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3027     MpegEncContext *c = v;
3028     int score1=0;
3029     int score2=0;
3030     int x,y;
3031
3032     for(y=0; y<h; y++){
3033         for(x=0; x<8; x++){
3034             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3035         }
3036         if(y+1<h){
3037             for(x=0; x<7; x++){
3038                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3039                              - s1[x+1] + s1[x+1+stride])
3040                         -FFABS(  s2[x  ] - s2[x  +stride]
3041                              - s2[x+1] + s2[x+1+stride]);
3042             }
3043         }
3044         s1+= stride;
3045         s2+= stride;
3046     }
3047
3048     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3049     else  return score1 + FFABS(score2)*8;
3050 }
3051
3052 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3053     int i;
3054     unsigned int sum=0;
3055
3056     for(i=0; i<8*8; i++){
3057         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3058         int w= weight[i];
3059         b>>= RECON_SHIFT;
3060         assert(-512<b && b<512);
3061
3062         sum += (w*b)*(w*b)>>4;
3063     }
3064     return sum>>2;
3065 }
3066
3067 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3068     int i;
3069
3070     for(i=0; i<8*8; i++){
3071         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3072     }
3073 }
3074
3075 /**
3076  * permutes an 8x8 block.
3077  * @param block the block which will be permuted according to the given permutation vector
3078  * @param permutation the permutation vector
3079  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3080  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3081  *                  (inverse) permutated to scantable order!
3082  */
3083 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3084 {
3085     int i;
3086     DCTELEM temp[64];
3087
3088     if(last<=0) return;
3089     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3090
3091     for(i=0; i<=last; i++){
3092         const int j= scantable[i];
3093         temp[j]= block[j];
3094         block[j]=0;
3095     }
3096
3097     for(i=0; i<=last; i++){
3098         const int j= scantable[i];
3099         const int perm_j= permutation[j];
3100         block[perm_j]= temp[j];
3101     }
3102 }
3103
/**
 * Comparison function that ignores all of its arguments and always
 * returns a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3107
3108 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3109     int i;
3110
3111     memset(cmp, 0, sizeof(void*)*6);
3112
3113     for(i=0; i<6; i++){
3114         switch(type&0xFF){
3115         case FF_CMP_SAD:
3116             cmp[i]= c->sad[i];
3117             break;
3118         case FF_CMP_SATD:
3119             cmp[i]= c->hadamard8_diff[i];
3120             break;
3121         case FF_CMP_SSE:
3122             cmp[i]= c->sse[i];
3123             break;
3124         case FF_CMP_DCT:
3125             cmp[i]= c->dct_sad[i];
3126             break;
3127         case FF_CMP_DCT264:
3128             cmp[i]= c->dct264_sad[i];
3129             break;
3130         case FF_CMP_DCTMAX:
3131             cmp[i]= c->dct_max[i];
3132             break;
3133         case FF_CMP_PSNR:
3134             cmp[i]= c->quant_psnr[i];
3135             break;
3136         case FF_CMP_BIT:
3137             cmp[i]= c->bit[i];
3138             break;
3139         case FF_CMP_RD:
3140             cmp[i]= c->rd[i];
3141             break;
3142         case FF_CMP_VSAD:
3143             cmp[i]= c->vsad[i];
3144             break;
3145         case FF_CMP_VSSE:
3146             cmp[i]= c->vsse[i];
3147             break;
3148         case FF_CMP_ZERO:
3149             cmp[i]= zero_cmp;
3150             break;
3151         case FF_CMP_NSSE:
3152             cmp[i]= c->nsse[i];
3153             break;
3154 #if CONFIG_DWT
3155         case FF_CMP_W53:
3156             cmp[i]= c->w53[i];
3157             break;
3158         case FF_CMP_W97:
3159             cmp[i]= c->w97[i];
3160             break;
3161 #endif
3162         default:
3163             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3164         }
3165     }
3166 }
3167
3168 static void clear_block_c(DCTELEM *block)
3169 {
3170     memset(block, 0, sizeof(DCTELEM)*64);
3171 }
3172
3173 /**
3174  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3175  */
3176 static void clear_blocks_c(DCTELEM *blocks)
3177 {
3178     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3179 }
3180
3181 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3182     long i;
3183     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3184         long a = *(long*)(src+i);
3185         long b = *(long*)(dst+i);
3186         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3187     }
3188     for(; i<w; i++)
3189         dst[i+0] += src[i+0];
3190 }
3191
3192 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3193     long i;
3194     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3195         long a = *(long*)(src1+i);
3196         long b = *(long*)(src2+i);
3197         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3198     }
3199     for(; i<w; i++)
3200         dst[i] = src1[i]+src2[i];
3201 }
3202
3203 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3204     long i;
3205 #if !HAVE_FAST_UNALIGNED
3206     if((long)src2 & (sizeof(long)-1)){
3207         for(i=0; i+7<w; i+=8){
3208             dst[i+0] = src1[i+0]-src2[i+0];
3209             dst[i+1] = src1[i+1]-src2[i+1];
3210             dst[i+2] = src1[i+2]-src2[i+2];
3211             dst[i+3] = src1[i+3]-src2[i+3];
3212             dst[i+4] = src1[i+4]-src2[i+4];
3213             dst[i+5] = src1[i+5]-src2[i+5];
3214             dst[i+6] = src1[i+6]-src2[i+6];
3215             dst[i+7] = src1[i+7]-src2[i+7];
3216         }
3217     }else
3218 #endif
3219     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3220         long a = *(long*)(src1+i);
3221         long b = *(long*)(src2+i);
3222         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3223     }
3224     for(; i<w; i++)
3225         dst[i+0] = src1[i+0]-src2[i+0];
3226 }
3227
/**
 * Reconstruct a line from residuals using median prediction.
 *
 * For every position the predictor is mid_pred() of the previous output
 * sample, the sample from src1 at the same position, and
 * (previous + src1[i] - previous src1 sample) & 0xFF. The residual diff[i] is
 * added to it and the result is stored modulo 256.
 *
 * @param dst      reconstructed output
 * @param src1     reference line
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: sample preceding dst[0]; out: last reconstructed sample
 * @param left_top in: sample preceding src1[0]; out: last src1 sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int top      = src1[i];
        const int gradient = (cur + top - top_left) & 0xFF;

        cur      = mid_pred(cur, top, gradient) + diff[i];
        top_left = top;
        dst[i]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
3244
/**
 * Compute median-prediction residuals for a line.
 *
 * For every position the predictor is mid_pred() of the previous src2 sample,
 * the sample from src1 at the same position, and
 * (previous + src1[i] - previous src1 sample) & 0xFF. dst receives
 * src2[i] minus that predictor, modulo 256.
 *
 * @param dst      residual output
 * @param src1     reference line
 * @param src2     line to encode
 * @param w        number of samples
 * @param left     in: sample preceding src2[0]; out: last src2 sample
 * @param left_top in: sample preceding src1[0]; out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int top      = src1[i];
        const int gradient = (prev + top - top_left) & 0xFF;
        const int pred     = mid_pred(prev, top, gradient);

        top_left = top;
        prev     = src2[i];
        dst[i]   = prev - pred;
    }

    *left     = prev;
    *left_top = top_left;
}
3262
/**
 * Running-sum reconstruction: each output byte is the low 8 bits of the
 * accumulator after adding the corresponding source byte.
 *
 * @param dst output bytes
 * @param src residuals
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the final (unmasked) accumulator
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
3281
/* byte offsets of the four components inside one 32-bit pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum reconstruction of w 4-byte pixels, with one independent
 * accumulator per component. Each output byte is the low 8 bits of its
 * component's accumulator.
 *
 * @param dst   output pixels (4 bytes each)
 * @param src   residual pixels (4 bytes each)
 * @param w     number of pixels
 * @param red   in: initial accumulator; out: final (unmasked) accumulator
 * @param green likewise for the G component
 * @param blue  likewise for the B component
 * @param alpha likewise for the A component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++, src += 4, dst += 4) {
        /* read the whole source pixel before writing the output pixel */
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3322
/* Out-of-place butterfly: sum = in0 + in1, diff = in0 - in1.
 * Expands to two statements; both inputs are evaluated twice. */
#define BUTTERFLY2(sum, diff, in0, in1) \
sum  = (in0) + (in1);\
diff = (in0) - (in1);

/* In-place butterfly: p becomes p + q and q becomes the old p - q. */
#define BUTTERFLY1(p, q) \
{\
    int a, b;\
    a = p;\
    b = q;\
    p = a + b;\
    q = a - b;\
}

/* Final butterfly stage folded into the accumulation: |p + q| + |p - q|. */
#define BUTTERFLYA(p, q) (FFABS((p) + (q)) + FFABS((p) - (q)))
3337
/**
 * Sum of absolute transform coefficients of the 8x8 difference src - dst,
 * using three butterfly stages along each line followed by three along each
 * column (the last column stage is fused into the accumulation).
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8
 * @return accumulated absolute coefficient sum
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* transform along each line */
    for (i = 0; i < 8; i++) {
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;
        int *row = temp + 8*i;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* transform along each column and accumulate magnitudes */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3389
/**
 * Sum of absolute transform coefficients of an 8x8 block, using three
 * butterfly stages along each line followed by three along each column (the
 * last column stage is fused into the accumulation). The magnitude of
 * temp[0] + temp[32] is subtracted from the total afterwards ("-mean").
 *
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8
 * @return accumulated absolute coefficient sum minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* transform along each line */
    for (i = 0; i < 8; i++) {
        const uint8_t *sp = src + stride*i;
        int *row = temp + 8*i;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* transform along each column and accumulate magnitudes */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3437
3438 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3439     MpegEncContext * const s= (MpegEncContext *)c;
3440     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3441
3442     assert(h==8);
3443
3444     s->dsp.diff_pixels(temp, src1, src2, stride);
3445     s->dsp.fdct(temp);
3446     return s->dsp.sum_abs_dctelem(temp);
3447 }
3448
#if CONFIG_GPL
/*
 * One 8-point integer transform pass. The user supplies SRC(x) to read input
 * x and DST(x,v) to consume output x; every SRC() read happens before the
 * first DST(), so the pass may be done in place.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Difference two 8x8 blocks, apply DCT8_1D along the first array dimension
 * in place, then apply it along the other dimension while summing the
 * absolute values of the outputs.
 *
 * @param c      MpegEncContext supplying dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive lines
 * @param h      unused
 * @return sum of absolute transform outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM coef[8][8];
    int sad = 0;
    int k;

    s->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* first pass: transform coef[k][0..7] in place */
#define SRC(x) coef[k][x]
#define DST(x,v) coef[k][x]= v
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform coef[0..7][k], accumulating |output| only */
#define SRC(x) coef[x][k]
#define DST(x,v) sad += FFABS(v)
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST
    return sad;
}
#endif
3501
3502 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3503     MpegEncContext * const s= (MpegEncContext *)c;
3504     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3505     int sum=0, i;
3506
3507     assert(h==8);
3508
3509     s->dsp.diff_pixels(temp, src1, src2, stride);
3510     s->dsp.fdct(temp);
3511
3512     for(i=0; i<64; i++)
3513         sum= FFMAX(sum, FFABS(temp[i]));
3514
3515     return sum;
3516 }
3517
3518 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3519     MpegEncContext * const s= (MpegEncContext *)c;
3520     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3521     DCTELEM * const bak = temp+64;
3522     int sum=0, i;
3523
3524     assert(h==8);
3525     s->mb_intra=0;
3526
3527     s->dsp.diff_pixels(temp, src1, src2, stride);
3528
3529     memcpy(bak, temp, 64*sizeof(DCTELEM));
3530
3531     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3532     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3533     ff_simple_idct(temp); //FIXME
3534
3535     for(i=0; i<64; i++)
3536         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3537
3538     return sum;
3539 }
3540
3541 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3542     MpegEncContext * const s= (MpegEncContext *)c;
3543     const uint8_t *scantable= s->intra_scantable.permutated;
3544     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3545     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3546     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3547     int i, last, run, bits, level, distortion, start_i;
3548     const int esc_length= s->ac_esc_length;
3549     uint8_t * length;
3550     uint8_t * last_length;
3551
3552     assert(h==8);
3553
3554     copy_block8(lsrc1, src1, 8, stride, 8);
3555     copy_block8(lsrc2, src2, 8, stride, 8);
3556
3557     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3558
3559     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3560
3561     bits=0;
3562
3563     if (s->mb_intra) {
3564         start_i = 1;
3565         length     = s->intra_ac_vlc_length;
3566         last_length= s->intra_ac_vlc_last_length;
3567         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3568     } else {
3569         start_i = 0;
3570         length     = s->inter_ac_vlc_length;
3571         last_length= s->inter_ac_vlc_last_length;
3572     }
3573
3574     if(last>=start_i){
3575         run=0;
3576         for(i=start_i; i<last; i++){
3577             int j= scantable[i];
3578             level= temp[j];
3579
3580             if(level){
3581                 level+=64;
3582                 if((level&(~127)) == 0){
3583                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3584                 }else
3585                     bits+= esc_length;
3586                 run=0;
3587             }else
3588                 run++;
3589         }
3590         i= scantable[last];
3591
3592         level= temp[i] + 64;
3593
3594         assert(level - 64);
3595
3596         if((level&(~127)) == 0){
3597             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3598         }else
3599             bits+= esc_length;
3600
3601     }
3602
3603     if(last>=0){
3604         if(s->mb_intra)
3605             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3606         else
3607             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3608     }
3609
3610     s->dsp.idct_add(lsrc2, 8, temp);
3611
3612     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3613
3614     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3615 }
3616
3617 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3618     MpegEncContext * const s= (MpegEncContext *)c;
3619     const uint8_t *scantable= s->intra_scantable.permutated;
3620     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3621     int i, last, run, bits, level, start_i;
3622     const int esc_length= s->ac_esc_length;
3623     uint8_t * length;
3624     uint8_t * last_length;
3625
3626     assert(h==8);
3627
3628     s->dsp.diff_pixels(temp, src1, src2, stride);
3629
3630     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3631
3632     bits=0;
3633
3634     if (s->mb_intra) {
3635         start_i = 1;
3636         length     = s->intra_ac_vlc_length;
3637         last_length= s->intra_ac_vlc_last_length;
3638         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3639     } else {
3640         start_i = 0;
3641         length     = s->inter_ac_vlc_length;
3642         last_length= s->inter_ac_vlc_last_length;
3643     }
3644
3645     if(last>=start_i){
3646         run=0;
3647         for(i=start_i; i<last; i++){
3648             int j= scantable[i];
3649             level= temp[j];
3650
3651             if(level){
3652                 level+=64;
3653                 if((level&(~127)) == 0){
3654                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3655                 }else
3656                     bits+= esc_length;
3657                 run=0;
3658             }else
3659                 run++;
3660         }
3661         i= scantable[last];
3662
3663         level= temp[i] + 64;
3664
3665         assert(level - 64);
3666
3667         if((level&(~127)) == 0){
3668             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3669         }else
3670             bits+= esc_length;
3671     }
3672
3673     return bits;
3674 }
3675
/* Generates vsad_intra<size>_c(): sum of absolute differences between each
 * sample and the sample one line below it, over a size-wide block and lines
 * 0..h-1. The c and dummy arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - s[col + stride]);                                                       \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3693
/**
 * Sum of absolute vertical differences of the error signal s1 - s2 over a
 * 16-pixel-wide block: for each pair of adjacent lines, |(s1 - s2) on the
 * upper line minus (s1 - s2) on the lower line| is accumulated.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive lines
 * @param h      number of lines
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += FFABS(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3708
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): sum of squared differences between each
 * sample and the sample one line below it, over a size-wide block and lines
 * 0..h-1. The c and dummy arguments are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - s[col + stride]);                                                          \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3727
/**
 * Sum of squared vertical differences of the error signal s1 - s2 over a
 * 16-pixel-wide block: for each pair of adjacent lines, the square of
 * ((s1 - s2) on the upper line minus (s1 - s2) on the lower line) is
 * accumulated.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive lines
 * @param h      number of lines
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += SQ(delta);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3742
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];
        total += delta * delta;
    }
    return total;
}
3751
/* Instantiate the *16_c variants of the 8x8 comparison functions above.
 * WRAPPER8_16_SQ is defined outside this part of the file; presumably it
 * builds each 16-wide function by calling the 8x8 one on every 8x8 quadrant
 * and summing the results -- confirm against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3762
/**
 * In-place element-wise product: dst[k] *= src[k] for k in [0, len).
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    const float *const end = src + len;

    while (src < end) {
        *dst = *dst * *src;
        dst++;
        src++;
    }
}
3768
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[k] = src0[k] * src1[len-1-k] for k in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
3775
/**
 * Element-wise multiply-add: dst[k] = src0[k] * src1[k] + src2[k]
 * for k in [0, len).
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3781
/**
 * Windowed overlap of two len-sample inputs into 2*len outputs.
 *
 * For k in [0, len), with m = 2*len-1-k, s0 = src0[k], s1 = src1[len-1-k],
 * wi = win[k] and wj = win[m]:
 *     dst[k] = s0*wj - s1*wi + add_bias
 *     dst[m] = s0*wi + s1*wj + add_bias
 *
 * @param dst      output, 2*len samples
 * @param src0     first input, len samples
 * @param src1     second input, len samples
 * @param win      window, 2*len samples
 * @param add_bias constant added to every output
 * @param len      number of samples in each input
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int m = 2*len - 1 - k;
        /* load all operands first; dst may be written over them afterwards */
        float s0 = src0[k];
        float s1 = src1[len - 1 - k];
        float wi = win[k];
        float wj = win[m];

        dst[k] = s0*wj - s1*wi + add_bias;
        dst[m] = s0*wi + s1*wj + add_bias;
    }
}
3796
/**
 * Scale a vector: dst[k] = src[k] * mul for k in [0, len).
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *const end = src + len;

    while (src < end)
        *dst++ = *src++ * mul;
}
3804
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Output pair n (dst[2n], dst[2n+1]) is src[2n..2n+1] times the two elements
 * of sv[n], times mul. Samples are processed two at a time while the start
 * index is below len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 2) {
        const float *vec = *sv++;

        dst[k    ] = src[k    ] * vec[0] * mul;
        dst[k + 1] = src[k + 1] * vec[1] * mul;
    }
}
3814
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Output quad n (dst[4n..4n+3]) is src[4n..4n+3] times the four elements of
 * sv[n], times mul. Samples are processed four at a time while the start
 * index is below len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 4) {
        const float *vec = *sv++;

        dst[k    ] = src[k    ] * vec[0] * mul;
        dst[k + 1] = src[k + 1] * vec[1] * mul;
        dst[k + 2] = src[k + 2] * vec[2] * mul;
        dst[k + 3] = src[k + 3] * vec[3] * mul;
    }
}
3826
/**
 * Expand a sequence of 2-element vectors, scaled by mul, into dst.
 *
 * Output pair n (dst[2n], dst[2n+1]) is the two elements of sv[n] times mul.
 * Samples are produced two at a time while the start index is below len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 2) {
        const float *vec = *sv++;

        dst[k    ] = vec[0] * mul;
        dst[k + 1] = vec[1] * mul;
    }
}
3836
/**
 * Expand a sequence of 4-element vectors, scaled by mul, into dst.
 *
 * Output quad n (dst[4n..4n+3]) is the four elements of sv[n] times mul.
 * Samples are produced four at a time while the start index is below len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 4) {
        const float *vec = *sv++;

        dst[k    ] = vec[0] * mul;
        dst[k + 1] = vec[1] * mul;
        dst[k + 2] = vec[2] * mul;
        dst[k + 3] = vec[3] * mul;
    }
}
3848
/**
 * In-place butterfly of two vectors: afterwards v1[k] holds the old
 * v1[k] + v2[k] and v2[k] holds the old v1[k] - v2[k].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float p = v1[k];
        const float q = v2[k];

        v1[k] = p + q;
        v2[k] = p - q;
    }
}
3859
/**
 * Dot product of two float vectors, accumulated in a float in index order.
 *
 * @return sum of v1[k] * v2[k] for k in [0, len); 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
3870
/**
 * Convert integers to float and scale them: dst[k] = src[k] * mul
 * for k in [0, len).
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    const int *const end = src + len;

    while (src < end)
        *dst++ = *src++ * mul;
}
3876
/**
 * Clip one value given as a raw 32-bit pattern, using unsigned integer
 * compares only.
 *
 * @param a        pattern to clip
 * @param mini     pattern of the lower bound (the caller passes a negative float)
 * @param maxi     pattern of the upper bound (the caller passes a positive float)
 * @param maxisign maxi with bit 31 flipped
 * @return mini if a > mini as unsigned, maxi if a with bit 31 flipped is
 *         above maxisign, otherwise a
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] by operating on the 32-bit patterns of
 * the samples. Called by vector_clipf_c() only when min < 0 and max > 0.
 *
 * Samples are processed in groups of 8, so up to the next multiple of 8
 * elements are read and written.
 *
 * NOTE(review): relies on float being 32 bits wide with the sign in bit 31
 * and on the buffers being accessible through uint32_t pointers.
 *
 * @param dst output
 * @param src input
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of samples
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for(i=0; i<len; i+=8) {
        for(k=0; k<8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
    }
}
/**
 * Clip a float vector to [min, max].
 *
 * When the bounds straddle zero the integer-compare variant is used;
 * otherwise every sample goes through av_clipf(). Samples are processed in
 * groups of 8, so up to the next multiple of 8 elements are read and written.
 *
 * @param dst output
 * @param src input
 * @param min lower bound
 * @param max upper bound
 * @param len number of samples
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3921
3922 static av_always_inline int float_to_int16_one(const float *src){
3923     int_fast32_t tmp = *(const int32_t*)src;
3924     if(tmp & 0xf0000){
3925         tmp = (0x43c0ffff - tmp)>>31;
3926         // is this faster on some gcc/cpu combinations?
3927 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3928 //      else                 tmp = 0;
3929     }
3930     return tmp - 0x8000;
3931 }
3932
/**
 * Convert len floats from src to 16-bit samples in dst, one element at a
 * time, using float_to_int16_one().
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len)
{
    int n = 0;

    while (n < len) {
        dst[n] = float_to_int16_one(src + n);
        n++;
    }
}
3938
/**
 * Convert planar float channels to interleaved 16-bit samples.
 *
 * src[c] is the input plane of channel c; sample i of channel c is written
 * to dst[i*channels + c]. The two-channel case has its own loop.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels)
{
    int i, j, c;

    if (channels != 2) {
        for (c = 0; c < channels; c++) {
            j = c;
            for (i = 0; i < len; i++) {
                dst[j] = float_to_int16_one(src[c] + i);
                j += channels;
            }
        }
        return;
    }

    for (i = 0; i < len; i++) {
        dst[2*i]     = float_to_int16_one(src[0] + i);
        dst[2*i + 1] = float_to_int16_one(src[1] + i);
    }
}
3952
/**
 * Dot product of two int16 vectors.
 *
 * Each individual product is shifted right by shift before it is added to
 * the running sum.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every product
 * @return the accumulated sum
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int sum = 0;

    for (; order; order--) {
        const int prod = *v1++ * *v2++;
        sum += prod >> shift;
    }

    return sum;
}
3962
/**
 * Compute the dot product of v1 and v2 while updating v1 in place.
 *
 * For every element the product v1[i]*v2[i] is accumulated using the value
 * of v1[i] from before the update, then mul*v3[i] is added to v1[i].
 *
 * @return the dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int dot = 0;

    for (; order; order--, v1++, v2++, v3++) {
        dot   += v1[0] * v2[0];
        v1[0] += mul * v3[0];
    }

    return dot;
}
3972
/*
 * Fixed-point weights used by wmv2_idct_row() and wmv2_idct_col().
 * W1..W7 are 2048*sqrt(2)*cos(k*pi/16) rounded to the nearest integer, as
 * noted next to each value; W0 carries no formula and has the same value
 * as W4.
 */
3973 #define W0 2048
3974 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3975 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3976 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3977 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3978 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3979 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3980 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3981
3982 static void wmv2_idct_row(short * b)
3983 {
3984     int s1,s2;
3985     int a0,a1,a2,a3,a4,a5,a6,a7;
3986     /*step 1*/
3987     a1 = W1*b[1]+W7*b[7];
3988     a7 = W7*b[1]-W1*b[7];
3989     a5 = W5*b[5]+W3*b[3];
3990     a3 = W3*b[5]-W5*b[3];
3991     a2 = W2*b[2]+W6*b[6];
3992     a6 = W6*b[2]-W2*b[6];
3993     a0 = W0*b[0]+W0*b[4];
3994     a4 = W0*b[0]-W0*b[4];
3995     /*step 2*/
3996     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3997     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3998     /*step 3*/
3999     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4000     b[1] = (a4+a6 +s1   + (1<<7))>>8;
4001     b[2] = (a4-a6 +s2   + (1<<7))>>8;
4002     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4003     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4004     b[5] = (a4-a6 -s2   + (1<<7))>>8;
4005     b[6] = (a4+a6 -s1   + (1<<7))>>8;
4006     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4007 }
4008 static void wmv2_idct_col(short * b)
4009 {
4010     int s1,s2;
4011     int a0,a1,a2,a3,a4,a5,a6,a7;
4012     /*step 1, with extended precision*/
4013     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4014     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4015     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4016     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4017     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4018     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4019     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4020     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4021     /*step 2*/
4022     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4023     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4024     /*step 3*/
4025     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4026     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4027     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4028     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4029
4030     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4031     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4032     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4033     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4034 }
/**
 * In-place WMV2 IDCT of a 64-coefficient block: all eight rows are
 * transformed first, then all eight columns.
 */
void ff_wmv2_idct_c(short * block)
{
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
4045 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4046  converted */
4047 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4048 {
4049     ff_wmv2_idct_c(block);
4050     put_pixels_clamped_c(block, dest, line_size);
4051 }
4052 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4053 {
4054     ff_wmv2_idct_c(block);
4055     add_pixels_clamped_c(block, dest, line_size);
4056 }
4057 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4058 {
4059     j_rev_dct (block);
4060     put_pixels_clamped_c(block, dest, line_size);
4061 }
4062 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4063 {
4064     j_rev_dct (block);
4065     add_pixels_clamped_c(block, dest, line_size);
4066 }
4067
4068 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4069 {
4070     j_rev_dct4 (block);
4071     put_pixels_clamped4_c(block, dest, line_size);
4072 }
4073 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4074 {
4075     j_rev_dct4 (block);
4076     add_pixels_clamped4_c(block, dest, line_size);
4077 }
4078
4079 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4080 {
4081     j_rev_dct2 (block);
4082     put_pixels_clamped2_c(block, dest, line_size);
4083 }
4084 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4085 {
4086     j_rev_dct2 (block);
4087     add_pixels_clamped2_c(block, dest, line_size);
4088 }
4089
4090 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4091 {
4092     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4093
4094     dest[0] = cm[(block[0] + 4)>>3];
4095 }
4096 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4097 {
4098     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4099
4100     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4101 }
4102
4103 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4104
4105 /* init static data */
4106 av_cold void dsputil_static_init(void)
4107 {
4108     int i;
4109
4110     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4111     for(i=0;i<MAX_NEG_CROP;i++) {
4112         ff_cropTbl[i] = 0;
4113         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4114     }
4115
4116     for(i=0;i<512;i++) {
4117         ff_squareTbl[i] = (i - 256) * (i - 256);
4118     }
4119
4120     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4121 }
4122
4123 int ff_check_alignment(void){
4124     static int did_fail=0;
4125     DECLARE_ALIGNED(16, int, aligned);
4126
4127     if((intptr_t)&aligned & 15){
4128         if(!did_fail){
4129 #if HAVE_MMX || HAVE_ALTIVEC
4130             av_log(NULL, AV_LOG_ERROR,
4131                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4132                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4133                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4134                 "Do not report crashes to FFmpeg developers.\n");
4135 #endif
4136             did_fail=1;
4137         }
4138         return -1;
4139     }
4140     return 0;
4141 }
4142
/**
 * Fill in a DSPContext.
 *
 * Selects the forward DCT (encoder builds only) and the IDCT from the
 * settings in avctx, installs the C implementations for the remaining
 * function pointers, then calls every architecture-specific init function
 * enabled at build time (presumably so they can replace entries with
 * optimised versions -- confirm in the respective dsputil_init_*()).
 * Finally fills the still-empty 2-tap qpel slots, a few RV30/RV40 table
 * entries and c->idct_permutation.
 *
 * @param c     context to initialise
 * @param avctx codec context; dct_algo, idct_algo and lowres are read here
 *              and it is passed on to the codec/arch specific init functions
 */
4143 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4144 {
4145     int i;
4146
         /* logs once if stack variables are misaligned; the result is ignored here */
4147     ff_check_alignment();
4148
         /* forward DCT selection, driven by avctx->dct_algo */
4149 #if CONFIG_ENCODERS
4150     if(avctx->dct_algo==FF_DCT_FASTINT) {
4151         c->fdct = fdct_ifast;
4152         c->fdct248 = fdct_ifast248;
4153     }
4154     else if(avctx->dct_algo==FF_DCT_FAAN) {
4155         c->fdct = ff_faandct;
4156         c->fdct248 = ff_faandct248;
4157     }
4158     else {
4159         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4160         c->fdct248 = ff_fdct248_islow;
4161     }
4162 #endif //CONFIG_ENCODERS
4163
         /* IDCT selection: lowres 1, 2 and 3 pick j_rev_dct4/2/1, otherwise
          * avctx->idct_algo decides */
4164     if(avctx->lowres==1){
4165         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4166             c->idct_put= ff_jref_idct4_put;
4167             c->idct_add= ff_jref_idct4_add;
4168         }else{
4169             c->idct_put= ff_h264_lowres_idct_put_c;
4170             c->idct_add= ff_h264_lowres_idct_add_c;
4171         }
4172         c->idct    = j_rev_dct4;
4173         c->idct_permutation_type= FF_NO_IDCT_PERM;
4174     }else if(avctx->lowres==2){
4175         c->idct_put= ff_jref_idct2_put;
4176         c->idct_add= ff_jref_idct2_add;
4177         c->idct    = j_rev_dct2;
4178         c->idct_permutation_type= FF_NO_IDCT_PERM;
4179     }else if(avctx->lowres==3){
4180         c->idct_put= ff_jref_idct1_put;
4181         c->idct_add= ff_jref_idct1_add;
4182         c->idct    = j_rev_dct1;
4183         c->idct_permutation_type= FF_NO_IDCT_PERM;
4184     }else{
4185         if(avctx->idct_algo==FF_IDCT_INT){
4186             c->idct_put= ff_jref_idct_put;
4187             c->idct_add= ff_jref_idct_add;
4188             c->idct    = j_rev_dct;
4189             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4190         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4191                 avctx->idct_algo==FF_IDCT_VP3){
4192             c->idct_put= ff_vp3_idct_put_c;
4193             c->idct_add= ff_vp3_idct_add_c;
4194             c->idct    = ff_vp3_idct_c;
4195             c->idct_permutation_type= FF_NO_IDCT_PERM;
4196         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4197             c->idct_put= ff_wmv2_idct_put_c;
4198             c->idct_add= ff_wmv2_idct_add_c;
4199             c->idct    = ff_wmv2_idct_c;
4200             c->idct_permutation_type= FF_NO_IDCT_PERM;
4201         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4202             c->idct_put= ff_faanidct_put;
4203             c->idct_add= ff_faanidct_add;
4204             c->idct    = ff_faanidct;
4205             c->idct_permutation_type= FF_NO_IDCT_PERM;
4206         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
                 /* NOTE(review): only idct_put is assigned in this branch;
                  * c->idct and c->idct_add are not written here */
4207             c->idct_put= ff_ea_idct_put_c;
4208             c->idct_permutation_type= FF_NO_IDCT_PERM;
4209         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4210             c->idct     = ff_bink_idct_c;
4211             c->idct_add = ff_bink_idct_add_c;
4212             c->idct_put = ff_bink_idct_put_c;
4213             c->idct_permutation_type = FF_NO_IDCT_PERM;
4214         }else{ //accurate/default
4215             c->idct_put= ff_simple_idct_put;
4216             c->idct_add= ff_simple_idct_add;
4217             c->idct    = ff_simple_idct;
4218             c->idct_permutation_type= FF_NO_IDCT_PERM;
4219         }
4220     }
4221
         /* C implementations for the remaining function pointers */
4222     c->get_pixels = get_pixels_c;
4223     c->diff_pixels = diff_pixels_c;
4224     c->put_pixels_clamped = put_pixels_clamped_c;
4225     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4226     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4227     c->add_pixels_clamped = add_pixels_clamped_c;
4228     c->add_pixels8 = add_pixels8_c;
4229     c->add_pixels4 = add_pixels4_c;
4230     c->sum_abs_dctelem = sum_abs_dctelem_c;
4231     c->gmc1 = gmc1_c;
4232     c->gmc = ff_gmc_c;
4233     c->clear_block = clear_block_c;
4234     c->clear_blocks = clear_blocks_c;
4235     c->pix_sum = pix_sum_c;
4236     c->pix_norm1 = pix_norm1_c;
4237
4238     c->fill_block_tab[0] = fill_block16_c;
4239     c->fill_block_tab[1] = fill_block8_c;
4240     c->scale_block = scale_block_c;
4241
4242     /* TODO [0] 16  [1] 8 */
4243     c->pix_abs[0][0] = pix_abs16_c;
4244     c->pix_abs[0][1] = pix_abs16_x2_c;
4245     c->pix_abs[0][2] = pix_abs16_y2_c;
4246     c->pix_abs[0][3] = pix_abs16_xy2_c;
4247     c->pix_abs[1][0] = pix_abs8_c;
4248     c->pix_abs[1][1] = pix_abs8_x2_c;
4249     c->pix_abs[1][2] = pix_abs8_y2_c;
4250     c->pix_abs[1][3] = pix_abs8_xy2_c;
4251
         /* helper: fill the four entries (plain, x2, y2, xy2) of one
          * <PFX>_pixels_tab row with the <PFX>_pixels<NUM>*_c functions */
4252 #define dspfunc(PFX, IDX, NUM) \
4253     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4254     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4255     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4256     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4257
4258     dspfunc(put, 0, 16);
4259     dspfunc(put_no_rnd, 0, 16);
4260     dspfunc(put, 1, 8);
4261     dspfunc(put_no_rnd, 1, 8);
4262     dspfunc(put, 2, 4);
4263     dspfunc(put, 3, 2);
4264
4265     dspfunc(avg, 0, 16);
4266     dspfunc(avg_no_rnd, 0, 16);
4267     dspfunc(avg, 1, 8);
4268     dspfunc(avg_no_rnd, 1, 8);
4269     dspfunc(avg, 2, 4);
4270     dspfunc(avg, 3, 2);
4271 #undef dspfunc
4272
4273     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4274     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4275
         /* tpel tables: indices 3, 7 and 11 and above are not assigned here */
4276     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4277     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4278     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4279     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4280     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4281     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4282     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4283     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4284     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4285
4286     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4287     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4288     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4289     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4290     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4291     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4292     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4293     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4294     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4295
         /* helper (redefined): fill all 16 entries mc00..mc33 of one
          * <PFX>_pixels_tab row with the <PFX><NUM>_mcXY_c functions */
4296 #define dspfunc(PFX, IDX, NUM) \
4297     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4298     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4299     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4300     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4301     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4302     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4303     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4304     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4305     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4306     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4307     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4308     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4309     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4310     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4311     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4312     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4313
4314     dspfunc(put_qpel, 0, 16);
4315     dspfunc(put_no_rnd_qpel, 0, 16);
4316
4317     dspfunc(avg_qpel, 0, 16);
4318     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4319
4320     dspfunc(put_qpel, 1, 8);
4321     dspfunc(put_no_rnd_qpel, 1, 8);
4322
4323     dspfunc(avg_qpel, 1, 8);
4324     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4325
4326     dspfunc(put_h264_qpel, 0, 16);
4327     dspfunc(put_h264_qpel, 1, 8);
4328     dspfunc(put_h264_qpel, 2, 4);
4329     dspfunc(put_h264_qpel, 3, 2);
4330     dspfunc(avg_h264_qpel, 0, 16);
4331     dspfunc(avg_h264_qpel, 1, 8);
4332     dspfunc(avg_h264_qpel, 2, 4);
4333
4334 #undef dspfunc
4335     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4336     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4337     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4338     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4339     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4340     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4341     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4342     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4343
4344     c->draw_edges = draw_edges_c;
4345
         /* codec-specific init functions, compiled in only when the codec is enabled */
4346 #if CONFIG_CAVS_DECODER
4347     ff_cavsdsp_init(c,avctx);
4348 #endif
4349
4350 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4351     ff_mlp_init(c, avctx);
4352 #endif
4353 #if CONFIG_VC1_DECODER
4354     ff_vc1dsp_init(c,avctx);
4355 #endif
4356 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4357     ff_intrax8dsp_init(c,avctx);
4358 #endif
4359 #if CONFIG_RV30_DECODER
4360     ff_rv30dsp_init(c,avctx);
4361 #endif
4362 #if CONFIG_RV40_DECODER
4363     ff_rv40dsp_init(c,avctx);
4364     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4365     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4366     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4367     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4368 #endif
4369
4370     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4371     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4372     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4373     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4374     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4375     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4376     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4377     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4378
         /* helper: entries [0] and [1] of a comparison table get the
          * <name>16_c and <name>8x8_c functions */
4379 #define SET_CMP_FUNC(name) \
4380     c->name[0]= name ## 16_c;\
4381     c->name[1]= name ## 8x8_c;
4382
4383     SET_CMP_FUNC(hadamard8_diff)
4384     c->hadamard8_diff[4]= hadamard8_intra16_c;
4385     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4386     SET_CMP_FUNC(dct_sad)
4387     SET_CMP_FUNC(dct_max)
4388 #if CONFIG_GPL
4389     SET_CMP_FUNC(dct264_sad)
4390 #endif
4391     c->sad[0]= pix_abs16_c;
4392     c->sad[1]= pix_abs8_c;
4393     c->sse[0]= sse16_c;
4394     c->sse[1]= sse8_c;
4395     c->sse[2]= sse4_c;
4396     SET_CMP_FUNC(quant_psnr)
4397     SET_CMP_FUNC(rd)
4398     SET_CMP_FUNC(bit)
4399     c->vsad[0]= vsad16_c;
4400     c->vsad[4]= vsad_intra16_c;
4401     c->vsad[5]= vsad_intra8_c;
4402     c->vsse[0]= vsse16_c;
4403     c->vsse[4]= vsse_intra16_c;
4404     c->vsse[5]= vsse_intra8_c;
4405     c->nsse[0]= nsse16_c;
4406     c->nsse[1]= nsse8_c;
4407 #if CONFIG_DWT
4408     ff_dsputil_init_dwt(c);
4409 #endif
4410
4411     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4412
4413     c->add_bytes= add_bytes_c;
4414     c->add_bytes_l2= add_bytes_l2_c;
4415     c->diff_bytes= diff_bytes_c;
4416     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4417     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4418     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4419     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4420     c->bswap_buf= bswap_buf;
4421 #if CONFIG_PNG_DECODER
4422     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4423 #endif
4424
4425     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4426         c->h263_h_loop_filter= h263_h_loop_filter_c;
4427         c->h263_v_loop_filter= h263_v_loop_filter_c;
4428     }
4429
4430     if (CONFIG_VP3_DECODER) {
4431         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4432         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4433         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4434     }
4435     if (CONFIG_VP6_DECODER) {
4436         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4437     }
4438
4439     c->h261_loop_filter= h261_loop_filter_c;
4440
4441     c->try_8x8basis= try_8x8basis_c;
4442     c->add_8x8basis= add_8x8basis_c;
4443
4444 #if CONFIG_VORBIS_DECODER
4445     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4446 #endif
4447 #if CONFIG_AC3_DECODER
4448     c->ac3_downmix = ff_ac3_downmix_c;
4449 #endif
4450 #if CONFIG_LPC
4451     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4452 #endif
4453     c->vector_fmul = vector_fmul_c;
4454     c->vector_fmul_reverse = vector_fmul_reverse_c;
4455     c->vector_fmul_add = vector_fmul_add_c;
4456     c->vector_fmul_window = ff_vector_fmul_window_c;
4457     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4458     c->vector_clipf = vector_clipf_c;
4459     c->float_to_int16 = ff_float_to_int16_c;
4460     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4461     c->scalarproduct_int16 = scalarproduct_int16_c;
4462     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4463     c->scalarproduct_float = scalarproduct_float_c;
4464     c->butterflies_float = butterflies_float_c;
4465     c->vector_fmul_scalar = vector_fmul_scalar_c;
4466
4467     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4468     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4469
4470     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4471     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4472
4473     c->shrink[0]= ff_img_copy_plane;
4474     c->shrink[1]= ff_shrink22;
4475     c->shrink[2]= ff_shrink44;
4476     c->shrink[3]= ff_shrink88;
4477
4478     c->prefetch= just_return;
4479
         /* cleared so that the fallback loop after the arch init can tell
          * which 2-tap qpel entries were left unset */
4480     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4481     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4482
         /* architecture/platform specific init functions, selected by build-time constants */
4483     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4484     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4485     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4486     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4487     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4488     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4489     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4490     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4491     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4492
         /* every 2-tap qpel entry that is still NULL falls back to the
          * corresponding H.264 qpel function.
          * NOTE(review): 64 entries are indexed through row [0]; presumably
          * this relies on the tables being contiguous 4x16 arrays -- confirm
          * against the DSPContext declaration */
4493     for(i=0; i<64; i++){
4494         if(!c->put_2tap_qpel_pixels_tab[0][i])
4495             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4496         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4497             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4498     }
4499
         /* entry 0 of the RV30/RV40 tables reuses the H.264 qpel entry 0 */
4500     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4501     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4502     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4503     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4504
4505     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4506     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4507     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4508     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4509
         /* build c->idct_permutation from c->idct_permutation_type */
4510     switch(c->idct_permutation_type){
4511     case FF_NO_IDCT_PERM:
4512         for(i=0; i<64; i++)
4513             c->idct_permutation[i]= i;
4514         break;
4515     case FF_LIBMPEG2_IDCT_PERM:
4516         for(i=0; i<64; i++)
4517             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4518         break;
4519     case FF_SIMPLE_IDCT_PERM:
4520         for(i=0; i<64; i++)
4521             c->idct_permutation[i]= simple_mmx_permutation[i];
4522         break;
4523     case FF_TRANSPOSE_IDCT_PERM:
4524         for(i=0; i<64; i++)
4525             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4526         break;
4527     case FF_PARTTRANS_IDCT_PERM:
4528         for(i=0; i<64; i++)
4529             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4530         break;
4531     case FF_SSE2_IDCT_PERM:
4532         for(i=0; i<64; i++)
4533             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4534         break;
4535     default:
             /* unknown type: only an error is logged, idct_permutation is not written */
4536         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4537     }
4538 }
4539