git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavcore/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "lpc.h"
  40 #include "ac3dec.h"
  41 #include "vorbis.h"
  42 #include "png.h"
  43
  44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  45 uint32_t ff_squareTbl[512] = {0, };
  46
  47 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  48 #define pb_7f (~0UL/255 * 0x7f)
  49 #define pb_80 (~0UL/255 * 0x80)
  50
  51 const uint8_t ff_zigzag_direct[64] = {
  52     0,   1,  8, 16,  9,  2,  3, 10,
  53     17, 24, 32, 25, 18, 11,  4,  5,
  54     12, 19, 26, 33, 40, 48, 41, 34,
  55     27, 20, 13,  6,  7, 14, 21, 28,
  56     35, 42, 49, 56, 57, 50, 43, 36,
  57     29, 22, 15, 23, 30, 37, 44, 51,
  58     58, 59, 52, 45, 38, 31, 39, 46,
  59     53, 60, 61, 54, 47, 55, 62, 63
  60 };
  61
  62 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  63    specification, we interleave the fields */
  64 const uint8_t ff_zigzag248_direct[64] = {
  65      0,  8,  1,  9, 16, 24,  2, 10,
  66     17, 25, 32, 40, 48, 56, 33, 41,
  67     18, 26,  3, 11,  4, 12, 19, 27,
  68     34, 42, 49, 57, 50, 58, 35, 43,
  69     20, 28,  5, 13,  6, 14, 21, 29,
  70     36, 44, 51, 59, 52, 60, 37, 45,
  71     22, 30,  7, 15, 23, 31, 38, 46,
  72     53, 61, 54, 62, 39, 47, 55, 63,
  73 };
  74
  75 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  76 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  77
  78 const uint8_t ff_alternate_horizontal_scan[64] = {
  79     0,  1,   2,  3,  8,  9, 16, 17,
  80     10, 11,  4,  5,  6,  7, 15, 14,
  81     13, 12, 19, 18, 24, 25, 32, 33,
  82     26, 27, 20, 21, 22, 23, 28, 29,
  83     30, 31, 34, 35, 40, 41, 48, 49,
  84     42, 43, 36, 37, 38, 39, 44, 45,
  85     46, 47, 50, 51, 56, 57, 58, 59,
  86     52, 53, 54, 55, 60, 61, 62, 63,
  87 };
  88
  89 const uint8_t ff_alternate_vertical_scan[64] = {
  90     0,  8,  16, 24,  1,  9,  2, 10,
  91     17, 25, 32, 40, 48, 56, 57, 49,
  92     41, 33, 26, 18,  3, 11,  4, 12,
  93     19, 27, 34, 42, 50, 58, 35, 43,
  94     51, 59, 20, 28,  5, 13,  6, 14,
  95     21, 29, 36, 44, 52, 60, 37, 45,
  96     53, 61, 22, 30,  7, 15, 23, 31,
  97     38, 46, 54, 62, 39, 47, 55, 63,
  98 };
  99
 100 /* Input permutation for the simple_idct_mmx */
 101 static const uint8_t simple_mmx_permutation[64]={
 102         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 103         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 104         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 105         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 106         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 107         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 108         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 109         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 110 };
 111
 112 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 113
 114 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 115     int i;
 116     int end;
 117
 118     st->scantable= src_scantable;
 119
 120     for(i=0; i<64; i++){
 121         int j;
 122         j = src_scantable[i];
 123         st->permutated[i] = permutation[j];
 124 #if ARCH_PPC
 125         st->inverse[j] = i;
 126 #endif
 127     }
 128
 129     end=-1;
 130     for(i=0; i<64; i++){
 131         int j;
 132         j = st->permutated[i];
 133         if(j>end) end=j;
 134         st->raster_end[i]= end;
 135     }
 136 }
 137
 138 static int pix_sum_c(uint8_t * pix, int line_size)
 139 {
 140     int s, i, j;
 141
 142     s = 0;
 143     for (i = 0; i < 16; i++) {
 144         for (j = 0; j < 16; j += 8) {
 145             s += pix[0];
 146             s += pix[1];
 147             s += pix[2];
 148             s += pix[3];
 149             s += pix[4];
 150             s += pix[5];
 151             s += pix[6];
 152             s += pix[7];
 153             pix += 8;
 154         }
 155         pix += line_size - 16;
 156     }
 157     return s;
 158 }
 159
 160 static int pix_norm1_c(uint8_t * pix, int line_size)
 161 {
 162     int s, i, j;
 163     uint32_t *sq = ff_squareTbl + 256;
 164
 165     s = 0;
 166     for (i = 0; i < 16; i++) {
 167         for (j = 0; j < 16; j += 8) {
 168 #if 0
 169             s += sq[pix[0]];
 170             s += sq[pix[1]];
 171             s += sq[pix[2]];
 172             s += sq[pix[3]];
 173             s += sq[pix[4]];
 174             s += sq[pix[5]];
 175             s += sq[pix[6]];
 176             s += sq[pix[7]];
 177 #else
 178 #if LONG_MAX > 2147483647
 179             register uint64_t x=*(uint64_t*)pix;
 180             s += sq[x&0xff];
 181             s += sq[(x>>8)&0xff];
 182             s += sq[(x>>16)&0xff];
 183             s += sq[(x>>24)&0xff];
 184             s += sq[(x>>32)&0xff];
 185             s += sq[(x>>40)&0xff];
 186             s += sq[(x>>48)&0xff];
 187             s += sq[(x>>56)&0xff];
 188 #else
 189             register uint32_t x=*(uint32_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             x=*(uint32_t*)(pix+4);
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199 #endif
 200 #endif
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 209     int i;
 210
 211     for(i=0; i+8<=w; i+=8){
 212         dst[i+0]= av_bswap32(src[i+0]);
 213         dst[i+1]= av_bswap32(src[i+1]);
 214         dst[i+2]= av_bswap32(src[i+2]);
 215         dst[i+3]= av_bswap32(src[i+3]);
 216         dst[i+4]= av_bswap32(src[i+4]);
 217         dst[i+5]= av_bswap32(src[i+5]);
 218         dst[i+6]= av_bswap32(src[i+6]);
 219         dst[i+7]= av_bswap32(src[i+7]);
 220     }
 221     for(;i<w; i++){
 222         dst[i+0]= av_bswap32(src[i+0]);
 223     }
 224 }
 225
 226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 227 {
 228     int s, i;
 229     uint32_t *sq = ff_squareTbl + 256;
 230
 231     s = 0;
 232     for (i = 0; i < h; i++) {
 233         s += sq[pix1[0] - pix2[0]];
 234         s += sq[pix1[1] - pix2[1]];
 235         s += sq[pix1[2] - pix2[2]];
 236         s += sq[pix1[3] - pix2[3]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = ff_squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[0] - pix2[0]];
 251         s += sq[pix1[1] - pix2[1]];
 252         s += sq[pix1[2] - pix2[2]];
 253         s += sq[pix1[3] - pix2[3]];
 254         s += sq[pix1[4] - pix2[4]];
 255         s += sq[pix1[5] - pix2[5]];
 256         s += sq[pix1[6] - pix2[6]];
 257         s += sq[pix1[7] - pix2[7]];
 258         pix1 += line_size;
 259         pix2 += line_size;
 260     }
 261     return s;
 262 }
 263
 264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 265 {
 266     int s, i;
 267     uint32_t *sq = ff_squareTbl + 256;
 268
 269     s = 0;
 270     for (i = 0; i < h; i++) {
 271         s += sq[pix1[ 0] - pix2[ 0]];
 272         s += sq[pix1[ 1] - pix2[ 1]];
 273         s += sq[pix1[ 2] - pix2[ 2]];
 274         s += sq[pix1[ 3] - pix2[ 3]];
 275         s += sq[pix1[ 4] - pix2[ 4]];
 276         s += sq[pix1[ 5] - pix2[ 5]];
 277         s += sq[pix1[ 6] - pix2[ 6]];
 278         s += sq[pix1[ 7] - pix2[ 7]];
 279         s += sq[pix1[ 8] - pix2[ 8]];
 280         s += sq[pix1[ 9] - pix2[ 9]];
 281         s += sq[pix1[10] - pix2[10]];
 282         s += sq[pix1[11] - pix2[11]];
 283         s += sq[pix1[12] - pix2[12]];
 284         s += sq[pix1[13] - pix2[13]];
 285         s += sq[pix1[14] - pix2[14]];
 286         s += sq[pix1[15] - pix2[15]];
 287
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294 /* draw the edges of width 'w' of an image of size width, height */
 295 //FIXME check that this is ok for mpeg4 interlaced
 296 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 297 {
 298     uint8_t *ptr, *last_line;
 299     int i;
 300
 301     last_line = buf + (height - 1) * wrap;
 302     for(i=0;i<w;i++) {
 303         /* top and bottom */
 304         memcpy(buf - (i + 1) * wrap, buf, width);
 305         memcpy(last_line + (i + 1) * wrap, last_line, width);
 306     }
 307     /* left and right */
 308     ptr = buf;
 309     for(i=0;i<height;i++) {
 310         memset(ptr - w, ptr[0], w);
 311         memset(ptr + width, ptr[width-1], w);
 312         ptr += wrap;
 313     }
 314     /* corners */
 315     for(i=0;i<w;i++) {
 316         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 317         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 318         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 319         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 320     }
 321 }
 322
 323 /**
 324  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 325  * @param buf destination buffer
 326  * @param src source buffer
 327  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 328  * @param block_w width of block
 329  * @param block_h height of block
 330  * @param src_x x coordinate of the top left sample of the block in the source buffer
 331  * @param src_y y coordinate of the top left sample of the block in the source buffer
 332  * @param w width of the source buffer
 333  * @param h height of the source buffer
 334  */
 335 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 336                                     int src_x, int src_y, int w, int h){
 337     int x, y;
 338     int start_y, start_x, end_y, end_x;
 339
 340     if(src_y>= h){
 341         src+= (h-1-src_y)*linesize;
 342         src_y=h-1;
 343     }else if(src_y<=-block_h){
 344         src+= (1-block_h-src_y)*linesize;
 345         src_y=1-block_h;
 346     }
 347     if(src_x>= w){
 348         src+= (w-1-src_x);
 349         src_x=w-1;
 350     }else if(src_x<=-block_w){
 351         src+= (1-block_w-src_x);
 352         src_x=1-block_w;
 353     }
 354
 355     start_y= FFMAX(0, -src_y);
 356     start_x= FFMAX(0, -src_x);
 357     end_y= FFMIN(block_h, h-src_y);
 358     end_x= FFMIN(block_w, w-src_x);
 359
 360     // copy existing part
 361     for(y=start_y; y<end_y; y++){
 362         for(x=start_x; x<end_x; x++){
 363             buf[x + y*linesize]= src[x + y*linesize];
 364         }
 365     }
 366
 367     //top
 368     for(y=0; y<start_y; y++){
 369         for(x=start_x; x<end_x; x++){
 370             buf[x + y*linesize]= buf[x + start_y*linesize];
 371         }
 372     }
 373
 374     //bottom
 375     for(y=end_y; y<block_h; y++){
 376         for(x=start_x; x<end_x; x++){
 377             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 378         }
 379     }
 380
 381     for(y=0; y<block_h; y++){
 382        //left
 383         for(x=0; x<start_x; x++){
 384             buf[x + y*linesize]= buf[start_x + y*linesize];
 385         }
 386
 387        //right
 388         for(x=end_x; x<block_w; x++){
 389             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 390         }
 391     }
 392 }
 393
 394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 395 {
 396     int i;
 397
 398     /* read the pixels */
 399     for(i=0;i<8;i++) {
 400         block[0] = pixels[0];
 401         block[1] = pixels[1];
 402         block[2] = pixels[2];
 403         block[3] = pixels[3];
 404         block[4] = pixels[4];
 405         block[5] = pixels[5];
 406         block[6] = pixels[6];
 407         block[7] = pixels[7];
 408         pixels += line_size;
 409         block += 8;
 410     }
 411 }
 412
 413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 414                           const uint8_t *s2, int stride){
 415     int i;
 416
 417     /* read the pixels */
 418     for(i=0;i<8;i++) {
 419         block[0] = s1[0] - s2[0];
 420         block[1] = s1[1] - s2[1];
 421         block[2] = s1[2] - s2[2];
 422         block[3] = s1[3] - s2[3];
 423         block[4] = s1[4] - s2[4];
 424         block[5] = s1[5] - s2[5];
 425         block[6] = s1[6] - s2[6];
 426         block[7] = s1[7] - s2[7];
 427         s1 += stride;
 428         s2 += stride;
 429         block += 8;
 430     }
 431 }
 432
 433
 434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 435                                  int line_size)
 436 {
 437     int i;
 438     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 439
 440     /* read the pixels */
 441     for(i=0;i<8;i++) {
 442         pixels[0] = cm[block[0]];
 443         pixels[1] = cm[block[1]];
 444         pixels[2] = cm[block[2]];
 445         pixels[3] = cm[block[3]];
 446         pixels[4] = cm[block[4]];
 447         pixels[5] = cm[block[5]];
 448         pixels[6] = cm[block[6]];
 449         pixels[7] = cm[block[7]];
 450
 451         pixels += line_size;
 452         block += 8;
 453     }
 454 }
 455
 456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 457                                  int line_size)
 458 {
 459     int i;
 460     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 461
 462     /* read the pixels */
 463     for(i=0;i<4;i++) {
 464         pixels[0] = cm[block[0]];
 465         pixels[1] = cm[block[1]];
 466         pixels[2] = cm[block[2]];
 467         pixels[3] = cm[block[3]];
 468
 469         pixels += line_size;
 470         block += 8;
 471     }
 472 }
 473
 474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 475                                  int line_size)
 476 {
 477     int i;
 478     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 479
 480     /* read the pixels */
 481     for(i=0;i<2;i++) {
 482         pixels[0] = cm[block[0]];
 483         pixels[1] = cm[block[1]];
 484
 485         pixels += line_size;
 486         block += 8;
 487     }
 488 }
 489
 490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 491                                         uint8_t *restrict pixels,
 492                                         int line_size)
 493 {
 494     int i, j;
 495
 496     for (i = 0; i < 8; i++) {
 497         for (j = 0; j < 8; j++) {
 498             if (*block < -128)
 499                 *pixels = 0;
 500             else if (*block > 127)
 501                 *pixels = 255;
 502             else
 503                 *pixels = (uint8_t)(*block + 128);
 504             block++;
 505             pixels++;
 506         }
 507         pixels += (line_size - 8);
 508     }
 509 }
 510
 511 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 512                                     int line_size)
 513 {
 514     int i;
 515
 516     /* read the pixels */
 517     for(i=0;i<8;i++) {
 518         pixels[0] = block[0];
 519         pixels[1] = block[1];
 520         pixels[2] = block[2];
 521         pixels[3] = block[3];
 522         pixels[4] = block[4];
 523         pixels[5] = block[5];
 524         pixels[6] = block[6];
 525         pixels[7] = block[7];
 526
 527         pixels += line_size;
 528         block += 8;
 529     }
 530 }
 531
 532 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 533                           int line_size)
 534 {
 535     int i;
 536     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 537
 538     /* read the pixels */
 539     for(i=0;i<8;i++) {
 540         pixels[0] = cm[pixels[0] + block[0]];
 541         pixels[1] = cm[pixels[1] + block[1]];
 542         pixels[2] = cm[pixels[2] + block[2]];
 543         pixels[3] = cm[pixels[3] + block[3]];
 544         pixels[4] = cm[pixels[4] + block[4]];
 545         pixels[5] = cm[pixels[5] + block[5]];
 546         pixels[6] = cm[pixels[6] + block[6]];
 547         pixels[7] = cm[pixels[7] + block[7]];
 548         pixels += line_size;
 549         block += 8;
 550     }
 551 }
 552
 553 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 554                           int line_size)
 555 {
 556     int i;
 557     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 558
 559     /* read the pixels */
 560     for(i=0;i<4;i++) {
 561         pixels[0] = cm[pixels[0] + block[0]];
 562         pixels[1] = cm[pixels[1] + block[1]];
 563         pixels[2] = cm[pixels[2] + block[2]];
 564         pixels[3] = cm[pixels[3] + block[3]];
 565         pixels += line_size;
 566         block += 8;
 567     }
 568 }
 569
 570 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 571                           int line_size)
 572 {
 573     int i;
 574     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 575
 576     /* read the pixels */
 577     for(i=0;i<2;i++) {
 578         pixels[0] = cm[pixels[0] + block[0]];
 579         pixels[1] = cm[pixels[1] + block[1]];
 580         pixels += line_size;
 581         block += 8;
 582     }
 583 }
 584
 585 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 586 {
 587     int i;
 588     for(i=0;i<8;i++) {
 589         pixels[0] += block[0];
 590         pixels[1] += block[1];
 591         pixels[2] += block[2];
 592         pixels[3] += block[3];
 593         pixels[4] += block[4];
 594         pixels[5] += block[5];
 595         pixels[6] += block[6];
 596         pixels[7] += block[7];
 597         pixels += line_size;
 598         block += 8;
 599     }
 600 }
 601
 602 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 603 {
 604     int i;
 605     for(i=0;i<4;i++) {
 606         pixels[0] += block[0];
 607         pixels[1] += block[1];
 608         pixels[2] += block[2];
 609         pixels[3] += block[3];
 610         pixels += line_size;
 611         block += 4;
 612     }
 613 }
 614
 615 static int sum_abs_dctelem_c(DCTELEM *block)
 616 {
 617     int sum=0, i;
 618     for(i=0; i<64; i++)
 619         sum+= FFABS(block[i]);
 620     return sum;
 621 }
 622
 623 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 624 {
 625     int i;
 626
 627     for (i = 0; i < h; i++) {
 628         memset(block, value, 16);
 629         block += line_size;
 630     }
 631 }
 632
 633 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 634 {
 635     int i;
 636
 637     for (i = 0; i < h; i++) {
 638         memset(block, value, 8);
 639         block += line_size;
 640     }
 641 }
 642
 643 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 644 {
 645     int i, j;
 646     uint16_t *dst1 = (uint16_t *) dst;
 647     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 648
 649     for (j = 0; j < 8; j++) {
 650         for (i = 0; i < 8; i++) {
 651             dst1[i] = dst2[i] = src[i] * 0x0101;
 652         }
 653         src  += 8;
 654         dst1 += linesize;
 655         dst2 += linesize;
 656     }
 657 }
 658
 659 #if 0
 660
 661 #define PIXOP2(OPNAME, OP) \
 662 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 663 {\
 664     int i;\
 665     for(i=0; i<h; i++){\
 666         OP(*((uint64_t*)block), AV_RN64(pixels));\
 667         pixels+=line_size;\
 668         block +=line_size;\
 669     }\
 670 }\
 671 \
 672 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 673 {\
 674     int i;\
 675     for(i=0; i<h; i++){\
 676         const uint64_t a= AV_RN64(pixels  );\
 677         const uint64_t b= AV_RN64(pixels+1);\
 678         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 679         pixels+=line_size;\
 680         block +=line_size;\
 681     }\
 682 }\
 683 \
 684 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 685 {\
 686     int i;\
 687     for(i=0; i<h; i++){\
 688         const uint64_t a= AV_RN64(pixels  );\
 689         const uint64_t b= AV_RN64(pixels+1);\
 690         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 691         pixels+=line_size;\
 692         block +=line_size;\
 693     }\
 694 }\
 695 \
 696 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 697 {\
 698     int i;\
 699     for(i=0; i<h; i++){\
 700         const uint64_t a= AV_RN64(pixels          );\
 701         const uint64_t b= AV_RN64(pixels+line_size);\
 702         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 703         pixels+=line_size;\
 704         block +=line_size;\
 705     }\
 706 }\
 707 \
 708 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 709 {\
 710     int i;\
 711     for(i=0; i<h; i++){\
 712         const uint64_t a= AV_RN64(pixels          );\
 713         const uint64_t b= AV_RN64(pixels+line_size);\
 714         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 715         pixels+=line_size;\
 716         block +=line_size;\
 717     }\
 718 }\
 719 \
 720 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 721 {\
 722         int i;\
 723         const uint64_t a= AV_RN64(pixels  );\
 724         const uint64_t b= AV_RN64(pixels+1);\
 725         uint64_t l0=  (a&0x0303030303030303ULL)\
 726                     + (b&0x0303030303030303ULL)\
 727                     + 0x0202020202020202ULL;\
 728         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 729                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 730         uint64_t l1,h1;\
 731 \
 732         pixels+=line_size;\
 733         for(i=0; i<h; i+=2){\
 734             uint64_t a= AV_RN64(pixels  );\
 735             uint64_t b= AV_RN64(pixels+1);\
 736             l1=  (a&0x0303030303030303ULL)\
 737                + (b&0x0303030303030303ULL);\
 738             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 739               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 740             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 741             pixels+=line_size;\
 742             block +=line_size;\
 743             a= AV_RN64(pixels  );\
 744             b= AV_RN64(pixels+1);\
 745             l0=  (a&0x0303030303030303ULL)\
 746                + (b&0x0303030303030303ULL)\
 747                + 0x0202020202020202ULL;\
 748             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 749               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 750             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 751             pixels+=line_size;\
 752             block +=line_size;\
 753         }\
 754 }\
 755 \
 756 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 757 {\
 758         int i;\
 759         const uint64_t a= AV_RN64(pixels  );\
 760         const uint64_t b= AV_RN64(pixels+1);\
 761         uint64_t l0=  (a&0x0303030303030303ULL)\
 762                     + (b&0x0303030303030303ULL)\
 763                     + 0x0101010101010101ULL;\
 764         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 765                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 766         uint64_t l1,h1;\
 767 \
 768         pixels+=line_size;\
 769         for(i=0; i<h; i+=2){\
 770             uint64_t a= AV_RN64(pixels  );\
 771             uint64_t b= AV_RN64(pixels+1);\
 772             l1=  (a&0x0303030303030303ULL)\
 773                + (b&0x0303030303030303ULL);\
 774             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 775               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 776             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 777             pixels+=line_size;\
 778             block +=line_size;\
 779             a= AV_RN64(pixels  );\
 780             b= AV_RN64(pixels+1);\
 781             l0=  (a&0x0303030303030303ULL)\
 782                + (b&0x0303030303030303ULL)\
 783                + 0x0101010101010101ULL;\
 784             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 785               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 786             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 787             pixels+=line_size;\
 788             block +=line_size;\
 789         }\
 790 }\
 791 \
 792 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 793 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 794 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 795 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 796 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 797 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 798 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 799
 800 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 801 #else // 64 bit variant
 802
 803 #define PIXOP2(OPNAME, OP) \
 804 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 805     int i;\
 806     for(i=0; i<h; i++){\
 807         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 808         pixels+=line_size;\
 809         block +=line_size;\
 810     }\
 811 }\
 812 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 813     int i;\
 814     for(i=0; i<h; i++){\
 815         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 816         pixels+=line_size;\
 817         block +=line_size;\
 818     }\
 819 }\
 820 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 821     int i;\
 822     for(i=0; i<h; i++){\
 823         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 824         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 825         pixels+=line_size;\
 826         block +=line_size;\
 827     }\
 828 }\
 829 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 834                                                 int src_stride1, int src_stride2, int h){\
 835     int i;\
 836     for(i=0; i<h; i++){\
 837         uint32_t a,b;\
 838         a= AV_RN32(&src1[i*src_stride1  ]);\
 839         b= AV_RN32(&src2[i*src_stride2  ]);\
 840         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 841         a= AV_RN32(&src1[i*src_stride1+4]);\
 842         b= AV_RN32(&src2[i*src_stride2+4]);\
 843         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 844     }\
 845 }\
 846 \
 847 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 848                                                 int src_stride1, int src_stride2, int h){\
 849     int i;\
 850     for(i=0; i<h; i++){\
 851         uint32_t a,b;\
 852         a= AV_RN32(&src1[i*src_stride1  ]);\
 853         b= AV_RN32(&src2[i*src_stride2  ]);\
 854         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 855         a= AV_RN32(&src1[i*src_stride1+4]);\
 856         b= AV_RN32(&src2[i*src_stride2+4]);\
 857         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 858     }\
 859 }\
 860 \
 861 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 862                                                 int src_stride1, int src_stride2, int h){\
 863     int i;\
 864     for(i=0; i<h; i++){\
 865         uint32_t a,b;\
 866         a= AV_RN32(&src1[i*src_stride1  ]);\
 867         b= AV_RN32(&src2[i*src_stride2  ]);\
 868         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 869     }\
 870 }\
 871 \
 872 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 873                                                 int src_stride1, int src_stride2, int h){\
 874     int i;\
 875     for(i=0; i<h; i++){\
 876         uint32_t a,b;\
 877         a= AV_RN16(&src1[i*src_stride1  ]);\
 878         b= AV_RN16(&src2[i*src_stride2  ]);\
 879         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 880     }\
 881 }\
 882 \
 883 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 884                                                 int src_stride1, int src_stride2, int h){\
 885     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 886     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 887 }\
 888 \
 889 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 890                                                 int src_stride1, int src_stride2, int h){\
 891     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 892     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 893 }\
 894 \
 895 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 896     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 897 }\
 898 \
 899 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 900     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 901 }\
 902 \
 903 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 904     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 905 }\
 906 \
 907 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 908     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 909 }\
 910 \
 911 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 912                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 913     int i;\
 914     for(i=0; i<h; i++){\
 915         uint32_t a, b, c, d, l0, l1, h0, h1;\
 916         a= AV_RN32(&src1[i*src_stride1]);\
 917         b= AV_RN32(&src2[i*src_stride2]);\
 918         c= AV_RN32(&src3[i*src_stride3]);\
 919         d= AV_RN32(&src4[i*src_stride4]);\
 920         l0=  (a&0x03030303UL)\
 921            + (b&0x03030303UL)\
 922            + 0x02020202UL;\
 923         h0= ((a&0xFCFCFCFCUL)>>2)\
 924           + ((b&0xFCFCFCFCUL)>>2);\
 925         l1=  (c&0x03030303UL)\
 926            + (d&0x03030303UL);\
 927         h1= ((c&0xFCFCFCFCUL)>>2)\
 928           + ((d&0xFCFCFCFCUL)>>2);\
 929         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 930         a= AV_RN32(&src1[i*src_stride1+4]);\
 931         b= AV_RN32(&src2[i*src_stride2+4]);\
 932         c= AV_RN32(&src3[i*src_stride3+4]);\
 933         d= AV_RN32(&src4[i*src_stride4+4]);\
 934         l0=  (a&0x03030303UL)\
 935            + (b&0x03030303UL)\
 936            + 0x02020202UL;\
 937         h0= ((a&0xFCFCFCFCUL)>>2)\
 938           + ((b&0xFCFCFCFCUL)>>2);\
 939         l1=  (c&0x03030303UL)\
 940            + (d&0x03030303UL);\
 941         h1= ((c&0xFCFCFCFCUL)>>2)\
 942           + ((d&0xFCFCFCFCUL)>>2);\
 943         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 944     }\
 945 }\
 946 \
 947 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 948     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 949 }\
 950 \
 951 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 952     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 953 }\
 954 \
 955 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 956     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 957 }\
 958 \
 959 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 960     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 961 }\
 962 \
 963 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 964                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 965     int i;\
 966     for(i=0; i<h; i++){\
 967         uint32_t a, b, c, d, l0, l1, h0, h1;\
 968         a= AV_RN32(&src1[i*src_stride1]);\
 969         b= AV_RN32(&src2[i*src_stride2]);\
 970         c= AV_RN32(&src3[i*src_stride3]);\
 971         d= AV_RN32(&src4[i*src_stride4]);\
 972         l0=  (a&0x03030303UL)\
 973            + (b&0x03030303UL)\
 974            + 0x01010101UL;\
 975         h0= ((a&0xFCFCFCFCUL)>>2)\
 976           + ((b&0xFCFCFCFCUL)>>2);\
 977         l1=  (c&0x03030303UL)\
 978            + (d&0x03030303UL);\
 979         h1= ((c&0xFCFCFCFCUL)>>2)\
 980           + ((d&0xFCFCFCFCUL)>>2);\
 981         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 982         a= AV_RN32(&src1[i*src_stride1+4]);\
 983         b= AV_RN32(&src2[i*src_stride2+4]);\
 984         c= AV_RN32(&src3[i*src_stride3+4]);\
 985         d= AV_RN32(&src4[i*src_stride4+4]);\
 986         l0=  (a&0x03030303UL)\
 987            + (b&0x03030303UL)\
 988            + 0x01010101UL;\
 989         h0= ((a&0xFCFCFCFCUL)>>2)\
 990           + ((b&0xFCFCFCFCUL)>>2);\
 991         l1=  (c&0x03030303UL)\
 992            + (d&0x03030303UL);\
 993         h1= ((c&0xFCFCFCFCUL)>>2)\
 994           + ((d&0xFCFCFCFCUL)>>2);\
 995         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 996     }\
 997 }\
 998 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 999                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1000     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1002 }\
1003 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1004                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1005     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007 }\
1008 \
1009 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010 {\
1011         int i, a0, b0, a1, b1;\
1012         a0= pixels[0];\
1013         b0= pixels[1] + 2;\
1014         a0 += b0;\
1015         b0 += pixels[2];\
1016 \
1017         pixels+=line_size;\
1018         for(i=0; i<h; i+=2){\
1019             a1= pixels[0];\
1020             b1= pixels[1];\
1021             a1 += b1;\
1022             b1 += pixels[2];\
1023 \
1024             block[0]= (a1+a0)>>2; /* FIXME non put */\
1025             block[1]= (b1+b0)>>2;\
1026 \
1027             pixels+=line_size;\
1028             block +=line_size;\
1029 \
1030             a0= pixels[0];\
1031             b0= pixels[1] + 2;\
1032             a0 += b0;\
1033             b0 += pixels[2];\
1034 \
1035             block[0]= (a1+a0)>>2;\
1036             block[1]= (b1+b0)>>2;\
1037             pixels+=line_size;\
1038             block +=line_size;\
1039         }\
1040 }\
1041 \
1042 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1043 {\
1044         int i;\
1045         const uint32_t a= AV_RN32(pixels  );\
1046         const uint32_t b= AV_RN32(pixels+1);\
1047         uint32_t l0=  (a&0x03030303UL)\
1048                     + (b&0x03030303UL)\
1049                     + 0x02020202UL;\
1050         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1051                    + ((b&0xFCFCFCFCUL)>>2);\
1052         uint32_t l1,h1;\
1053 \
1054         pixels+=line_size;\
1055         for(i=0; i<h; i+=2){\
1056             uint32_t a= AV_RN32(pixels  );\
1057             uint32_t b= AV_RN32(pixels+1);\
1058             l1=  (a&0x03030303UL)\
1059                + (b&0x03030303UL);\
1060             h1= ((a&0xFCFCFCFCUL)>>2)\
1061               + ((b&0xFCFCFCFCUL)>>2);\
1062             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1063             pixels+=line_size;\
1064             block +=line_size;\
1065             a= AV_RN32(pixels  );\
1066             b= AV_RN32(pixels+1);\
1067             l0=  (a&0x03030303UL)\
1068                + (b&0x03030303UL)\
1069                + 0x02020202UL;\
1070             h0= ((a&0xFCFCFCFCUL)>>2)\
1071               + ((b&0xFCFCFCFCUL)>>2);\
1072             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073             pixels+=line_size;\
1074             block +=line_size;\
1075         }\
1076 }\
1077 \
1078 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1079 {\
1080     int j;\
1081     for(j=0; j<2; j++){\
1082         int i;\
1083         const uint32_t a= AV_RN32(pixels  );\
1084         const uint32_t b= AV_RN32(pixels+1);\
1085         uint32_t l0=  (a&0x03030303UL)\
1086                     + (b&0x03030303UL)\
1087                     + 0x02020202UL;\
1088         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1089                    + ((b&0xFCFCFCFCUL)>>2);\
1090         uint32_t l1,h1;\
1091 \
1092         pixels+=line_size;\
1093         for(i=0; i<h; i+=2){\
1094             uint32_t a= AV_RN32(pixels  );\
1095             uint32_t b= AV_RN32(pixels+1);\
1096             l1=  (a&0x03030303UL)\
1097                + (b&0x03030303UL);\
1098             h1= ((a&0xFCFCFCFCUL)>>2)\
1099               + ((b&0xFCFCFCFCUL)>>2);\
1100             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1101             pixels+=line_size;\
1102             block +=line_size;\
1103             a= AV_RN32(pixels  );\
1104             b= AV_RN32(pixels+1);\
1105             l0=  (a&0x03030303UL)\
1106                + (b&0x03030303UL)\
1107                + 0x02020202UL;\
1108             h0= ((a&0xFCFCFCFCUL)>>2)\
1109               + ((b&0xFCFCFCFCUL)>>2);\
1110             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1111             pixels+=line_size;\
1112             block +=line_size;\
1113         }\
1114         pixels+=4-line_size*(h+1);\
1115         block +=4-line_size*h;\
1116     }\
1117 }\
1118 \
1119 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1120 {\
1121     int j;\
1122     for(j=0; j<2; j++){\
1123         int i;\
1124         const uint32_t a= AV_RN32(pixels  );\
1125         const uint32_t b= AV_RN32(pixels+1);\
1126         uint32_t l0=  (a&0x03030303UL)\
1127                     + (b&0x03030303UL)\
1128                     + 0x01010101UL;\
1129         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1130                    + ((b&0xFCFCFCFCUL)>>2);\
1131         uint32_t l1,h1;\
1132 \
1133         pixels+=line_size;\
1134         for(i=0; i<h; i+=2){\
1135             uint32_t a= AV_RN32(pixels  );\
1136             uint32_t b= AV_RN32(pixels+1);\
1137             l1=  (a&0x03030303UL)\
1138                + (b&0x03030303UL);\
1139             h1= ((a&0xFCFCFCFCUL)>>2)\
1140               + ((b&0xFCFCFCFCUL)>>2);\
1141             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1142             pixels+=line_size;\
1143             block +=line_size;\
1144             a= AV_RN32(pixels  );\
1145             b= AV_RN32(pixels+1);\
1146             l0=  (a&0x03030303UL)\
1147                + (b&0x03030303UL)\
1148                + 0x01010101UL;\
1149             h0= ((a&0xFCFCFCFCUL)>>2)\
1150               + ((b&0xFCFCFCFCUL)>>2);\
1151             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1152             pixels+=line_size;\
1153             block +=line_size;\
1154         }\
1155         pixels+=4-line_size*(h+1);\
1156         block +=4-line_size*h;\
1157     }\
1158 }\
1159 \
1160 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1161 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1162 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1163 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1164 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1165 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1166 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1168
1169 #define op_avg(a, b) a = rnd_avg32(a, b)
1170 #endif
1171 #define op_put(a, b) a = b
1172
1173 PIXOP2(avg, op_avg)
1174 PIXOP2(put, op_put)
1175 #undef op_avg
1176 #undef op_put
1177
/* The no_rnd put entry points for 8 and 16 pixel widths are plain aliases of
 * the regular put routines. */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

/* Rounded integer mean of two / four values.  Every argument is wrapped in
 * parentheses so that operands containing lower-precedence operators
 * (e.g. `x|y`, `c ? a : b`) are averaged as a whole.  Arguments are still
 * evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1183
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both source blocks all use the same line stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride /* dst */, stride /* a */, stride /* b */,
                           h);
}
1187
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both source blocks all use the same line stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride /* dst */, stride /* a */, stride /* b */,
                          h);
}
1191
/**
 * Bilinear interpolation of an 8-pixel-wide block with a 1/16 sample offset.
 *
 * Each output pixel blends the 2x2 source neighbourhood starting at the same
 * position; the four weights sum to 256 and the result is scaled back by >>8
 * after adding the caller-supplied rounder.
 *
 * @param dst     output block, 8 pixels per row
 * @param src     source; rows 0..h and columns 0..8 are read
 * @param stride  line stride shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction in 1/16 units
 * @param y16     vertical fraction in 1/16 units
 * @param rounder constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16); /* weight of top-left     */
    const int B=(   x16)*(16-y16); /* weight of top-right    */
    const int C=(16-x16)*(   y16); /* weight of bottom-left  */
    const int D=(   x16)*(   y16); /* weight of bottom-right */
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++){
            dst[col]= (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + rounder)>>8;
        }
        dst+= stride;
        src+= stride;
    }
}
1214
/**
 * Warp an 8-pixel-wide block using a per-pixel source position that advances
 * linearly across the block.
 *
 * The running position (vx, vy) starts at (ox, oy) on each row, moves by
 * (dxx, dyx) per output pixel and the row origin moves by (dxy, dyy) per
 * output row.  After dropping the low 16 bits, the low `shift` bits of each
 * coordinate are the interpolation fraction and the remaining bits the
 * integer sample position.
 *
 * Positions inside [0, width-1) x [0, height-1) are interpolated bilinearly;
 * when one coordinate falls outside, it is clipped and only the other axis is
 * interpolated; when both fall outside, the clipped sample is copied as is.
 *
 * @param r      rounding constant added before the >>(2*shift) normalisation
 * @param width  source width; the last usable column index is width-1
 * @param height source height; the last usable row index is height-1
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s= 1<<shift;
    const int frac_mask= s-1;
    int y;

    /* from here on width/height hold the largest valid coordinate */
    width--;
    height--;

    for(y=0; y<h; y++, ox+= dxy, oy+= dyy){
        int vx= ox;
        int vy= oy;
        int x;

        for(x=0; x<8; x++, vx+= dxx, vy+= dyx){ //XXX FIXME optimize
            int src_x= vx>>16;
            int src_y= vy>>16;
            const int frac_x= src_x&frac_mask;
            const int frac_y= src_y&frac_mask;
            int x_inside, y_inside, index;

            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside= (unsigned)src_x < width;
            y_inside= (unsigned)src_y < height;

            if(x_inside && y_inside){
                /* full 2x2 bilinear blend */
                index= src_x + src_y*stride;
                dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                       + src[index       +1]*   frac_x )*(s-frac_y)
                                    + (  src[index+stride  ]*(s-frac_x)
                                       + src[index+stride+1]*   frac_x )*   frac_y
                                    + r)>>(shift*2);
            }else if(x_inside){
                /* row clipped: horizontal blend only */
                index= src_x + av_clip(src_y, 0, height)*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                      + src[index       +1]*   frac_x )*s
                                    + r)>>(shift*2);
            }else if(y_inside){
                /* column clipped: vertical blend only */
                index= av_clip(src_x, 0, width) + src_y*stride;
                dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                       + src[index+stride  ]*   frac_y )*s
                                    + r)>>(shift*2);
            }else{
                /* both clipped: nearest edge sample */
                index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                dst[y*stride + x]=    src[index         ];
            }
        }
    }
}
1272
/**
 * Zero-offset case: hand the block to the put_pixelsN_c routine matching
 * its width.  Widths other than 2, 4, 8 and 16 are silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if(width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if(width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if(width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1281
/**
 * Horizontal blend weighted 2:1 towards the current pixel.
 * The division by 3 is done as a multiply by 683 and >>11 (683/2048 ~ 1/3).
 * Reads one pixel to the right of the block.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1292
/**
 * Horizontal blend weighted 1:2 towards the right-hand neighbour.
 * The division by 3 is done as a multiply by 683 and >>11 (683/2048 ~ 1/3).
 * Reads one pixel to the right of the block.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1303
/**
 * Vertical blend weighted 2:1 towards the current row.
 * The division by 3 is done as a multiply by 683 and >>11 (683/2048 ~ 1/3).
 * Reads one row below the block.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1314
/**
 * 2x2 blend with weights 4 (current), 3 (right), 3 (below), 2 (diagonal).
 * The weights sum to 12; the division is a multiply by 2731 and >>15
 * (2731/32768 ~ 1/12).  Reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1325
/**
 * 2x2 blend with weights 3 (current), 2 (right), 4 (below), 3 (diagonal).
 * The weights sum to 12; the division is a multiply by 2731 and >>15
 * (2731/32768 ~ 1/12).  Reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1336
/**
 * Vertical blend weighted 1:2 towards the row below.
 * The division by 3 is done as a multiply by 683 and >>11 (683/2048 ~ 1/3).
 * Reads one row below the block.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1347
/**
 * 2x2 blend with weights 3 (current), 4 (right), 2 (below), 3 (diagonal).
 * The weights sum to 12; the division is a multiply by 2731 and >>15
 * (2731/32768 ~ 1/12).  Reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1358
/**
 * 2x2 blend with weights 2 (current), 3 (right), 3 (below), 4 (diagonal).
 * The weights sum to 12; the division is a multiply by 2731 and >>15
 * (2731/32768 ~ 1/12).  Reads one extra column and one extra row.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1369
/**
 * Zero-offset case: hand the block to the avg_pixelsN_c routine matching
 * its width.  Widths other than 2, 4, 8 and 16 are silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if(width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if(width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if(width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1378
/**
 * Horizontal 2:1 blend (current : right), then rounded average with the
 * value already stored in dst.  683/2048 approximates the division by 3.
 * Reads one pixel to the right of the block.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1389
/**
 * Horizontal 1:2 blend (current : right), then rounded average with the
 * value already stored in dst.  683/2048 approximates the division by 3.
 * Reads one pixel to the right of the block.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1400
/**
 * Vertical 2:1 blend (current : below), then rounded average with the
 * value already stored in dst.  683/2048 approximates the division by 3.
 * Reads one row below the block.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1411
/**
 * 2x2 blend with weights 4/3/3/2 (current, right, below, diagonal), then
 * rounded average with the value already stored in dst.
 * 2731/32768 approximates the division by 12.
 * Reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1422
/**
 * 2x2 blend with weights 3/2/4/3 (current, right, below, diagonal), then
 * rounded average with the value already stored in dst.
 * 2731/32768 approximates the division by 12.
 * Reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1433
/**
 * Vertical 1:2 blend (current : below), then rounded average with the
 * value already stored in dst.  683/2048 approximates the division by 3.
 * Reads one row below the block.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1444
/**
 * 2x2 blend with weights 3/4/2/3 (current, right, below, diagonal), then
 * rounded average with the value already stored in dst.
 * 2731/32768 approximates the division by 12.
 * Reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1455
/**
 * 2x2 blend with weights 2/3/3/4 (current, right, below, diagonal), then
 * rounded average with the value already stored in dst.
 * 2731/32768 approximates the division by 12.
 * Reads one extra column and one extra row.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
#if 0
/* Currently disabled.  TPEL_WIDTH(width) would emit fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c() around the generic put_tpel_pixels_mcXY_c()
 * helpers.  Each body is a plain call: the previous text prefixed the call
 * with `void`, which turns it into a malformed declaration and would not
 * compile once this section is enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1487
/*
 * H264_CHROMA_MC_W(OPNAME, OP, W) emits OPNAME##h264_chroma_mc<W>_c(), a
 * W-pixel-wide bilinear chroma interpolator.
 *
 * x and y are the fractional offsets in 1/8 units (asserted to lie in 0..7).
 * The four weights A..D sum to 64; OP receives the unnormalised weighted sum
 * and is responsible for rounding, scaling and storing it.
 *
 * When D is zero at least one of x, y is zero, so only two taps contribute:
 * the current pixel (weight A) and either the pixel below (C != 0, step =
 * stride) or the pixel to the right (step = 1) with weight E = B + C.
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++, dst+= stride, src+= stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++, dst+= stride, src+= stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            }\
        }\
    }\
}

/* Emits the 2-, 4- and 8-pixel-wide variants for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b is the weighted sum scaled by 64: round, scale down, then store (put)
 * or take the rounded mean with the existing destination value (avg). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1596
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding bias.
 *
 * x and y are the fractional offsets in 1/8 units (asserted to lie in 0..7).
 * The weights A..D sum to 64; the bias added before >>6 is 32 - 4 = 28
 * rather than 32.  Reads one extra column and one extra row.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+= stride, src+= stride){
        for(col=0; col<8; col++){
            dst[col] = (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
        }
    }
}
1620
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding bias,
 * combined with the existing destination through avg2().
 *
 * x and y are the fractional offsets in 1/8 units (asserted to lie in 0..7).
 * The weights A..D sum to 64; the bias added before >>6 is 32 - 4 = 28
 * rather than 32.  Reads one extra column and one extra row.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+= stride, src+= stride){
        for(col=0; col<8; col++){
            dst[col] = avg2(dst[col], ((A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6));
        }
    }
}
1644
/*
 * Tap kernel shared by the MPEG-4 quarter-pel low-pass filters generated by
 * QPEL_MC:
 *   20*(p[a0]+p[a1]) - 6*(p[b0]+p[b1]) + 3*(p[c0]+p[c1]) - (p[d0]+p[d1])
 * The value is left unscaled; the OP macro handed to QPEL_MC is responsible
 * for rounding, scaling and clipping it into the destination.
 */
#define MPEG4_QPEL_TAPS(p, a0, a1, b0, b1, c0, c1, d0, d1) \
    (((p)[a0]+(p)[a1])*20 - ((p)[b0]+(p)[b1])*6 + ((p)[c0]+(p)[c1])*3 - ((p)[d0]+(p)[d1]))

/*
 * QPEL_MC(r, OPNAME, RND, OP) emits one family of MPEG-4 quarter-pel motion
 * compensation functions for 8x8 and 16x16 blocks.
 *
 *   r      - not referenced inside the macro body
 *   OPNAME - name prefix of the generated functions and of the pixelsN_l2 /
 *            pixelsN_l4 helpers used for the final store
 *   RND    - infix selecting which "put" variant produces intermediate planes
 *   OP     - store macro OP(dst, value); it uses the local table pointer `cm`
 *
 * Generated per block size N (8 or 16):
 *   OPNAME mpeg4_qpelN_h_lowpass / _v_lowpass : horizontal / vertical filter
 *   OPNAME qpelN_mcXY_c                       : position (X, Y), X and Y in 0..3
 *   ff_ OPNAME qpelN_mcXY_old_c               : alternative variants blending
 *                                               separately filtered planes
 * Near the block borders the filter taps are folded back so that only samples
 * 0..N of a row/column are read.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal filter: 8 outputs per row for h rows, reading src[0..8] of each row. */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int row;\
    for(row=0; row<h; row++){\
        OP(dst[0], MPEG4_QPEL_TAPS(src, 0,1, 0,2, 1,3, 2,4));\
        OP(dst[1], MPEG4_QPEL_TAPS(src, 1,2, 0,3, 0,4, 1,5));\
        OP(dst[2], MPEG4_QPEL_TAPS(src, 2,3, 1,4, 0,5, 0,6));\
        OP(dst[3], MPEG4_QPEL_TAPS(src, 3,4, 2,5, 1,6, 0,7));\
        OP(dst[4], MPEG4_QPEL_TAPS(src, 4,5, 3,6, 2,7, 1,8));\
        OP(dst[5], MPEG4_QPEL_TAPS(src, 5,6, 4,7, 3,8, 2,8));\
        OP(dst[6], MPEG4_QPEL_TAPS(src, 6,7, 5,8, 4,8, 3,7));\
        OP(dst[7], MPEG4_QPEL_TAPS(src, 7,8, 6,8, 5,7, 4,6));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
/* Vertical filter: 8 columns, 8 outputs each, reading rows 0..8 of every column. */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col, k;\
    for(col=0; col<8; col++){\
        int t[9]; /* the column is captured completely before any output is stored */\
        for(k=0; k<9; k++)\
            t[k] = src[k*srcStride];\
        OP(dst[0*dstStride], MPEG4_QPEL_TAPS(t, 0,1, 0,2, 1,3, 2,4));\
        OP(dst[1*dstStride], MPEG4_QPEL_TAPS(t, 1,2, 0,3, 0,4, 1,5));\
        OP(dst[2*dstStride], MPEG4_QPEL_TAPS(t, 2,3, 1,4, 0,5, 0,6));\
        OP(dst[3*dstStride], MPEG4_QPEL_TAPS(t, 3,4, 2,5, 1,6, 0,7));\
        OP(dst[4*dstStride], MPEG4_QPEL_TAPS(t, 4,5, 3,6, 2,7, 1,8));\
        OP(dst[5*dstStride], MPEG4_QPEL_TAPS(t, 5,6, 4,7, 3,8, 2,8));\
        OP(dst[6*dstStride], MPEG4_QPEL_TAPS(t, 6,7, 5,8, 4,8, 3,7));\
        OP(dst[7*dstStride], MPEG4_QPEL_TAPS(t, 7,8, 6,8, 5,7, 4,6));\
        dst++;\
        src++;\
    }\
}\
\
/* Horizontal filter: 16 outputs per row for h rows, reading src[0..16] of each row. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int row;\
    for(row=0; row<h; row++){\
        OP(dst[ 0], MPEG4_QPEL_TAPS(src,  0, 1,  0, 2,  1, 3,  2, 4));\
        OP(dst[ 1], MPEG4_QPEL_TAPS(src,  1, 2,  0, 3,  0, 4,  1, 5));\
        OP(dst[ 2], MPEG4_QPEL_TAPS(src,  2, 3,  1, 4,  0, 5,  0, 6));\
        OP(dst[ 3], MPEG4_QPEL_TAPS(src,  3, 4,  2, 5,  1, 6,  0, 7));\
        OP(dst[ 4], MPEG4_QPEL_TAPS(src,  4, 5,  3, 6,  2, 7,  1, 8));\
        OP(dst[ 5], MPEG4_QPEL_TAPS(src,  5, 6,  4, 7,  3, 8,  2, 9));\
        OP(dst[ 6], MPEG4_QPEL_TAPS(src,  6, 7,  5, 8,  4, 9,  3,10));\
        OP(dst[ 7], MPEG4_QPEL_TAPS(src,  7, 8,  6, 9,  5,10,  4,11));\
        OP(dst[ 8], MPEG4_QPEL_TAPS(src,  8, 9,  7,10,  6,11,  5,12));\
        OP(dst[ 9], MPEG4_QPEL_TAPS(src,  9,10,  8,11,  7,12,  6,13));\
        OP(dst[10], MPEG4_QPEL_TAPS(src, 10,11,  9,12,  8,13,  7,14));\
        OP(dst[11], MPEG4_QPEL_TAPS(src, 11,12, 10,13,  9,14,  8,15));\
        OP(dst[12], MPEG4_QPEL_TAPS(src, 12,13, 11,14, 10,15,  9,16));\
        OP(dst[13], MPEG4_QPEL_TAPS(src, 13,14, 12,15, 11,16, 10,16));\
        OP(dst[14], MPEG4_QPEL_TAPS(src, 14,15, 13,16, 12,16, 11,15));\
        OP(dst[15], MPEG4_QPEL_TAPS(src, 15,16, 14,16, 13,15, 12,14));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
/* Vertical filter: 16 columns, 16 outputs each, reading rows 0..16 of every column. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col, k;\
    for(col=0; col<16; col++){\
        int t[17]; /* the column is captured completely before any output is stored */\
        for(k=0; k<17; k++)\
            t[k] = src[k*srcStride];\
        OP(dst[ 0*dstStride], MPEG4_QPEL_TAPS(t,  0, 1,  0, 2,  1, 3,  2, 4));\
        OP(dst[ 1*dstStride], MPEG4_QPEL_TAPS(t,  1, 2,  0, 3,  0, 4,  1, 5));\
        OP(dst[ 2*dstStride], MPEG4_QPEL_TAPS(t,  2, 3,  1, 4,  0, 5,  0, 6));\
        OP(dst[ 3*dstStride], MPEG4_QPEL_TAPS(t,  3, 4,  2, 5,  1, 6,  0, 7));\
        OP(dst[ 4*dstStride], MPEG4_QPEL_TAPS(t,  4, 5,  3, 6,  2, 7,  1, 8));\
        OP(dst[ 5*dstStride], MPEG4_QPEL_TAPS(t,  5, 6,  4, 7,  3, 8,  2, 9));\
        OP(dst[ 6*dstStride], MPEG4_QPEL_TAPS(t,  6, 7,  5, 8,  4, 9,  3,10));\
        OP(dst[ 7*dstStride], MPEG4_QPEL_TAPS(t,  7, 8,  6, 9,  5,10,  4,11));\
        OP(dst[ 8*dstStride], MPEG4_QPEL_TAPS(t,  8, 9,  7,10,  6,11,  5,12));\
        OP(dst[ 9*dstStride], MPEG4_QPEL_TAPS(t,  9,10,  8,11,  7,12,  6,13));\
        OP(dst[10*dstStride], MPEG4_QPEL_TAPS(t, 10,11,  9,12,  8,13,  7,14));\
        OP(dst[11*dstStride], MPEG4_QPEL_TAPS(t, 11,12, 10,13,  9,14,  8,15));\
        OP(dst[12*dstStride], MPEG4_QPEL_TAPS(t, 12,13, 11,14, 10,15,  9,16));\
        OP(dst[13*dstStride], MPEG4_QPEL_TAPS(t, 13,14, 12,15, 11,16, 10,16));\
        OP(dst[14*dstStride], MPEG4_QPEL_TAPS(t, 14,15, 13,16, 12,16, 11,15));\
        OP(dst[15*dstStride], MPEG4_QPEL_TAPS(t, 15,16, 14,16, 13,15, 12,14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8 block positions ---- */\
/* (1,0): source blended with the horizontally filtered plane. */\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, filt, stride, stride, 8, 8);\
}\
\
/* (2,0): horizontally filtered plane stored directly. */\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
/* (3,0): source shifted one pixel right, blended with the horizontally filtered plane. */\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, filt, stride, stride, 8, 8);\
}\
\
/* (0,1): local 9-row copy blended with its vertically filtered plane. */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filt[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filt, blk, 8, 16);\
    OPNAME ## pixels8_l2(dst, blk, filt, stride, 16, 8, 8);\
}\
\
/* (0,2): vertically filtered plane stored directly. */\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    copy_block9(blk, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, blk, stride, 16);\
}\
\
/* (0,3): local copy shifted one row down, blended with the vertically filtered plane. */\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filt[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filt, blk, 8, 16);\
    OPNAME ## pixels8_l2(dst, blk+16, filt, stride, 16, 8, 8);\
}\
/* (1,1), four-plane variant: copy, H, V and HV planes combined by pixels8_l4. */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufV[8*8];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk, bufH, bufV, bufHV, stride, 16, 8, 8, 8, 8);\
}\
/* (1,1): H plane blended with the copy in place, then blended with its vertical filter. */\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(bufH, bufH, blk, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufH, bufHV, stride, 8, 8, 8);\
}\
/* (3,1), four-plane variant. */\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufV[8*8];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufV, blk+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk+1, bufH, bufV, bufHV, stride, 16, 8, 8, 8, 8);\
}\
/* (3,1): as (1,1) but the H plane is blended with the copy shifted one pixel right. */\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(bufH, bufH, blk+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufH, bufHV, stride, 8, 8, 8);\
}\
/* (1,3), four-plane variant. */\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufV[8*8];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk+16, bufH+8, bufV, bufHV, stride, 16, 8, 8, 8, 8);\
}\
/* (1,3): as (1,1) but the final blend uses the H plane starting one row down. */\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(bufH, bufH, blk, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufH+8, bufHV, stride, 8, 8, 8);\
}\
/* (3,3), four-plane variant. */\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufV[8*8];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufV, blk+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk+17, bufH+8, bufV, bufHV, stride, 16, 8, 8, 8, 8);\
}\
/* (3,3): right-shifted copy for the in-place blend, H plane one row down for the final blend. */\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(bufH, bufH, blk+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufH+8, bufHV, stride, 8, 8, 8);\
}\
/* (2,1): H plane blended with its vertically filtered version. */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t bufH[8*9];\
    uint8_t bufHV[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufH, bufHV, stride, 8, 8, 8);\
}\
/* (2,3): as (2,1) but using the H plane starting one row down. */\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t bufH[8*9];\
    uint8_t bufHV[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufH+8, bufHV, stride, 8, 8, 8);\
}\
/* (1,2), variant blending separately computed V and HV planes. */\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufV[8*8];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufV, bufHV, stride, 8, 8, 8);\
}\
/* (1,2): H plane blended with the copy in place, then vertically filtered into dst. */\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(bufH, bufH, blk, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, bufH, stride, 8);\
}\
/* (3,2), variant blending separately computed V and HV planes. */\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    uint8_t bufV[8*8];\
    uint8_t bufHV[8*8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufV, blk+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(bufHV, bufH, 8, 8);\
    OPNAME ## pixels8_l2(dst, bufV, bufHV, stride, 8, 8, 8);\
}\
/* (3,2): as (1,2) but blended with the copy shifted one pixel right. */\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t bufH[8*9];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(bufH, bufH, blk+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, bufH, stride, 8);\
}\
/* (2,2): horizontal filter followed by vertical filter. */\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t bufH[8*9];\
    put ## RND ## mpeg4_qpel8_h_lowpass(bufH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, bufH, stride, 8);\
}\
\
/* ---- 16x16 block positions (same structure as the 8x8 functions above) ---- */\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, filt, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, filt, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filt[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filt, blk, 16, 24);\
    OPNAME ## pixels16_l2(dst, blk, filt, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    copy_block17(blk, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, blk, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filt[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filt, blk, 16, 24);\
    OPNAME ## pixels16_l2(dst, blk+24, filt, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufV[16*16];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk, bufH, bufV, bufHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(bufH, bufH, blk, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufH, bufHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufV[16*16];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufV, blk+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk+1, bufH, bufV, bufHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(bufH, bufH, blk+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufH, bufHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufV[16*16];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk+24, bufH+16, bufV, bufHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(bufH, bufH, blk, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufH+16, bufHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufV[16*16];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufV, blk+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk+25, bufH+16, bufV, bufHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(bufH, bufH, blk+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufH+16, bufHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t bufH[16*17];\
    uint8_t bufHV[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufH, bufHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t bufH[16*17];\
    uint8_t bufHV[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufH+16, bufHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufV[16*16];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufV, bufHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(bufH, bufH, blk, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, bufH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    uint8_t bufV[16*16];\
    uint8_t bufHV[16*16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufV, blk+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(bufHV, bufH, 16, 16);\
    OPNAME ## pixels16_l2(dst, bufV, bufHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t bufH[16*17];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(bufH, bufH, blk+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, bufH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t bufH[16*17];\
    put ## RND ## mpeg4_qpel16_h_lowpass(bufH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, bufH, stride, 16);\
}
2120
2121 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2122 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2123 #define op_put(a, b) a = cm[((b) + 16)>>5]
2124 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2125
2126 QPEL_MC(0, put_       , _       , op_put)
2127 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2128 QPEL_MC(0, avg_       , _       , op_avg)
2129 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2130 #undef op_avg
2131 #undef op_avg_no_rnd
2132 #undef op_put
2133 #undef op_put_no_rnd
2134
/*
 * The (0,0) quarter-pel position has no dedicated function: the mc00 names
 * are aliases for the plain block put/avg routines.  Both put_ and
 * put_no_rnd_ map to the same put routine.
 */
#define put_qpel8_mc00_c          ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c   ff_put_pixels8x8_c
#define avg_qpel8_mc00_c          ff_avg_pixels8x8_c
#define put_qpel16_mc00_c         ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c  ff_put_pixels16x16_c
#define avg_qpel16_mc00_c         ff_avg_pixels16x16_c
2141
2142 #if 1
2143 #define H264_LOWPASS(OPNAME, OP, OP2) \
2144 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2145     const int h=2;\
2146     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2147     int i;\
2148     for(i=0; i<h; i++)\
2149     {\
2150         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2151         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2152         dst+=dstStride;\
2153         src+=srcStride;\
2154     }\
2155 }\
2156 \
2157 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2158     const int w=2;\
2159     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2160     int i;\
2161     for(i=0; i<w; i++)\
2162     {\
2163         const int srcB= src[-2*srcStride];\
2164         const int srcA= src[-1*srcStride];\
2165         const int src0= src[0 *srcStride];\
2166         const int src1= src[1 *srcStride];\
2167         const int src2= src[2 *srcStride];\
2168         const int src3= src[3 *srcStride];\
2169         const int src4= src[4 *srcStride];\
2170         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2171         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2172         dst++;\
2173         src++;\
2174     }\
2175 }\
2176 \
2177 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2178     const int h=2;\
2179     const int w=2;\
2180     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2181     int i;\
2182     src -= 2*srcStride;\
2183     for(i=0; i<h+5; i++)\
2184     {\
2185         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2186         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2187         tmp+=tmpStride;\
2188         src+=srcStride;\
2189     }\
2190     tmp -= tmpStride*(h+5-2);\
2191     for(i=0; i<w; i++)\
2192     {\
2193         const int tmpB= tmp[-2*tmpStride];\
2194         const int tmpA= tmp[-1*tmpStride];\
2195         const int tmp0= tmp[0 *tmpStride];\
2196         const int tmp1= tmp[1 *tmpStride];\
2197         const int tmp2= tmp[2 *tmpStride];\
2198         const int tmp3= tmp[3 *tmpStride];\
2199         const int tmp4= tmp[4 *tmpStride];\
2200         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2201         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2202         dst++;\
2203         tmp++;\
2204     }\
2205 }\
2206 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2207     const int h=4;\
2208     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2209     int i;\
2210     for(i=0; i<h; i++)\
2211     {\
2212         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2213         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2214         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2215         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2216         dst+=dstStride;\
2217         src+=srcStride;\
2218     }\
2219 }\
2220 \
2221 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2222     const int w=4;\
2223     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2224     int i;\
2225     for(i=0; i<w; i++)\
2226     {\
2227         const int srcB= src[-2*srcStride];\
2228         const int srcA= src[-1*srcStride];\
2229         const int src0= src[0 *srcStride];\
2230         const int src1= src[1 *srcStride];\
2231         const int src2= src[2 *srcStride];\
2232         const int src3= src[3 *srcStride];\
2233         const int src4= src[4 *srcStride];\
2234         const int src5= src[5 *srcStride];\
2235         const int src6= src[6 *srcStride];\
2236         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2237         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2238         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2239         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2240         dst++;\
2241         src++;\
2242     }\
2243 }\
2244 \
2245 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2246     const int h=4;\
2247     const int w=4;\
2248     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2249     int i;\
2250     src -= 2*srcStride;\
2251     for(i=0; i<h+5; i++)\
2252     {\
2253         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2254         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2255         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2256         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2257         tmp+=tmpStride;\
2258         src+=srcStride;\
2259     }\
2260     tmp -= tmpStride*(h+5-2);\
2261     for(i=0; i<w; i++)\
2262     {\
2263         const int tmpB= tmp[-2*tmpStride];\
2264         const int tmpA= tmp[-1*tmpStride];\
2265         const int tmp0= tmp[0 *tmpStride];\
2266         const int tmp1= tmp[1 *tmpStride];\
2267         const int tmp2= tmp[2 *tmpStride];\
2268         const int tmp3= tmp[3 *tmpStride];\
2269         const int tmp4= tmp[4 *tmpStride];\
2270         const int tmp5= tmp[5 *tmpStride];\
2271         const int tmp6= tmp[6 *tmpStride];\
2272         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2273         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2274         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2275         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2276         dst++;\
2277         tmp++;\
2278     }\
2279 }\
2280 \
2281 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282     const int h=8;\
2283     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284     int i;\
2285     for(i=0; i<h; i++)\
2286     {\
2287         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2288         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2289         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2290         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2291         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2292         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2293         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2294         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2295         dst+=dstStride;\
2296         src+=srcStride;\
2297     }\
2298 }\
2299 \
2300 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2301     const int w=8;\
2302     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2303     int i;\
2304     for(i=0; i<w; i++)\
2305     {\
2306         const int srcB= src[-2*srcStride];\
2307         const int srcA= src[-1*srcStride];\
2308         const int src0= src[0 *srcStride];\
2309         const int src1= src[1 *srcStride];\
2310         const int src2= src[2 *srcStride];\
2311         const int src3= src[3 *srcStride];\
2312         const int src4= src[4 *srcStride];\
2313         const int src5= src[5 *srcStride];\
2314         const int src6= src[6 *srcStride];\
2315         const int src7= src[7 *srcStride];\
2316         const int src8= src[8 *srcStride];\
2317         const int src9= src[9 *srcStride];\
2318         const int src10=src[10*srcStride];\
2319         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2320         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2321         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2322         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2323         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2324         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2325         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2326         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2327         dst++;\
2328         src++;\
2329     }\
2330 }\
2331 \
2332 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2333     const int h=8;\
2334     const int w=8;\
2335     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2336     int i;\
2337     src -= 2*srcStride;\
2338     for(i=0; i<h+5; i++)\
2339     {\
2340         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2341         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2342         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2343         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2344         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2345         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2346         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2347         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2348         tmp+=tmpStride;\
2349         src+=srcStride;\
2350     }\
2351     tmp -= tmpStride*(h+5-2);\
2352     for(i=0; i<w; i++)\
2353     {\
2354         const int tmpB= tmp[-2*tmpStride];\
2355         const int tmpA= tmp[-1*tmpStride];\
2356         const int tmp0= tmp[0 *tmpStride];\
2357         const int tmp1= tmp[1 *tmpStride];\
2358         const int tmp2= tmp[2 *tmpStride];\
2359         const int tmp3= tmp[3 *tmpStride];\
2360         const int tmp4= tmp[4 *tmpStride];\
2361         const int tmp5= tmp[5 *tmpStride];\
2362         const int tmp6= tmp[6 *tmpStride];\
2363         const int tmp7= tmp[7 *tmpStride];\
2364         const int tmp8= tmp[8 *tmpStride];\
2365         const int tmp9= tmp[9 *tmpStride];\
2366         const int tmp10=tmp[10*tmpStride];\
2367         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2368         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2369         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2370         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2371         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2372         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2373         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2374         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2375         dst++;\
2376         tmp++;\
2377     }\
2378 }\
2379 \
2380 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2381     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2382     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2383     src += 8*srcStride;\
2384     dst += 8*dstStride;\
2385     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2386     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2387 }\
2388 \
2389 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2390     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2391     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2392     src += 8*srcStride;\
2393     dst += 8*dstStride;\
2394     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2395     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2396 }\
2397 \
2398 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2399     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2400     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2401     src += 8*srcStride;\
2402     dst += 8*dstStride;\
2403     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2404     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2405 }\
2406
/*
 * H264_MC(OPNAME, SIZE)
 *
 * Expands to the sixteen functions OPNAME##h264_qpel##SIZE##_mcXY_c()
 * (X and Y each in 0..3) for SIZE x SIZE blocks.  Ingredients used below:
 *   - the put_ h / v / hv low-pass filters render an intermediate plane
 *     into a stack buffer whose row stride is SIZE;
 *   - OPNAME##pixels##SIZE##_l2() is handed two planes plus dst
 *     (presumably blending them; see its definition);
 *   - vertical filtering never reads the picture directly: SIZE+5 rows,
 *     starting two rows above src, are first copied with copy_block##SIZE
 *     into a compact buffer, and the filter is pointed at the third row
 *     of that copy.
 * mc20, mc02 and mc22 run the OPNAME h, v and hv filter straight into dst;
 * mc00 forwards to OPNAME##pixels##SIZE##_c(); all remaining positions hand
 * two planes to the _l2 helper.
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hplane[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hplane, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hplane[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    /* same as mc10, but paired with the sample one column to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hplane, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, cur, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, cur, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    /* same as mc01, but paired with the row below */\
    OPNAME ## pixels ## SIZE ## _l2(dst, cur+SIZE, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hplane[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hplane[SIZE*SIZE];\
\
    /* vertical plane is taken one column to the right of src */\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hplane[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    /* horizontal plane is taken one row below src */\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
    uint8_t hplane[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, vplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid16[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid16, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t hvplane[SIZE*SIZE];\
    uint8_t hplane[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, mid16, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t hvplane[SIZE*SIZE];\
    uint8_t hplane[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, mid16, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hplane, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t hvplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, mid16, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vplane, hvplane, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const cur= &rows[2*SIZE];\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t hvplane[SIZE*SIZE];\
    uint8_t vplane[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvplane, mid16, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vplane, cur, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vplane, hvplane, stride, SIZE, SIZE, SIZE);\
}
2543
/*
 * Store operators plugged into H264_LOWPASS.  `b` is a raw filter sum and
 * `cm` is the local pointer into ff_cropTbl that every lowpass function
 * declares.
 *   - One filter pass: the taps (1, -5, 20, 20, -5, 1) add up to 32, so
 *     ((b) + 16) >> 5 is the rounded, normalised sample.
 *   - Two passes (hv): the sums are scaled by 32*32 = 1024, hence
 *     ((b) + 512) >> 10.
 * The *_put forms overwrite `a`; the *_avg forms replace `a` with the
 * rounded-up mean of its old value and the new sample.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the filters for both store flavours, then the per-size
 * motion-compensation entry points.  Note there is no avg_ variant for
 * size 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The operators are only meaningful inside the expansions above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2564 #endif
2565
/* From here on, the 8x8 and 16x16 mc00 names resolve to the non-static
 * ff_*_pixels helpers defined further down in this file. */
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2570
2571 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2572     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2573     int i;
2574
2575     for(i=0; i<h; i++){
2576         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2577         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2578         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2579         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2580         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2581         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2582         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2583         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2584         dst+=dstStride;
2585         src+=srcStride;
2586     }
2587 }
2588
/** Fixed-size entry point: put_pixels8_c() with the row count pinned to 8. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    put_pixels8_c(dst, src, stride, ROWS);
}
/** Fixed-size entry point: avg_pixels8_c() with the row count pinned to 8. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    avg_pixels8_c(dst, src, stride, ROWS);
}
/** Fixed-size entry point: put_pixels16_c() with the row count pinned to 16. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    put_pixels16_c(dst, src, stride, ROWS);
}
/** Fixed-size entry point: avg_pixels16_c() with the row count pinned to 16. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    avg_pixels16_c(dst, src, stride, ROWS);
}
2601
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points.  Each one simply forwards to the matching
 * *_pixels{8,16}_xy2_c() routine with the block height as row count.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    put_pixels16_xy2_c(dst, src, stride, ROWS);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    avg_pixels16_xy2_c(dst, src, stride, ROWS);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    put_pixels8_xy2_c(dst, src, stride, ROWS);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    avg_pixels8_xy2_c(dst, src, stride, ROWS);
}
#endif /* CONFIG_RV40_DECODER */
2616
2617 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2618     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2619     int i;
2620
2621     for(i=0; i<w; i++){
2622         const int src_1= src[ -srcStride];
2623         const int src0 = src[0          ];
2624         const int src1 = src[  srcStride];
2625         const int src2 = src[2*srcStride];
2626         const int src3 = src[3*srcStride];
2627         const int src4 = src[4*srcStride];
2628         const int src5 = src[5*srcStride];
2629         const int src6 = src[6*srcStride];
2630         const int src7 = src[7*srcStride];
2631         const int src8 = src[8*srcStride];
2632         const int src9 = src[9*srcStride];
2633         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2634         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2635         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2636         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2637         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2638         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2639         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2640         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2641         src++;
2642         dst++;
2643     }
2644 }
2645
/** 8x8 mspel mc10: horizontally filtered plane paired with src itself. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2651
/** 8x8 mspel mc20: horizontal filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, ROWS);
}
2655
/** 8x8 mspel mc30: horizontally filtered plane paired with src shifted one column right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, filtered, stride, stride, 8, 8);
}
2661
/** 8x8 mspel mc02: vertical filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { COLS = 8 };

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, COLS);
}
2665
/**
 * 8x8 mspel mc12: pairs the vertically filtered source with the
 * horizontally-then-vertically filtered source.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];    /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vplane[8*8];
    uint8_t hvplane[8*8];

    wmv2_mspel8_v_lowpass(vplane, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    /* skip the extra top row so the vertical filter is centred on src's rows */
    wmv2_mspel8_v_lowpass(hvplane, hrows + 8, 8, 8, 8);
    put_pixels8_l2(dst, vplane, hvplane, stride, 8, 8, 8);
}
/**
 * 8x8 mspel mc32: like mc12, except the vertically filtered plane is taken
 * one column to the right of src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];    /* 11 horizontally filtered rows, starting one row above src */
    uint8_t vplane[8*8];
    uint8_t hvplane[8*8];

    wmv2_mspel8_v_lowpass(vplane, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hvplane, hrows + 8, 8, 8, 8);
    put_pixels8_l2(dst, vplane, hvplane, stride, 8, 8, 8);
}
/** 8x8 mspel mc22: horizontal filter into a scratch buffer, then vertical filter into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];    /* 11 rows: one above src through nine below */

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows + 8, stride, 8, 8);
}
2689
2690 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2691     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2692     int x;
2693     const int strength= ff_h263_loop_filter_strength[qscale];
2694
2695     for(x=0; x<8; x++){
2696         int d1, d2, ad1;
2697         int p0= src[x-2*stride];
2698         int p1= src[x-1*stride];
2699         int p2= src[x+0*stride];
2700         int p3= src[x+1*stride];
2701         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2702
2703         if     (d<-2*strength) d1= 0;
2704         else if(d<-  strength) d1=-2*strength - d;
2705         else if(d<   strength) d1= d;
2706         else if(d< 2*strength) d1= 2*strength - d;
2707         else                   d1= 0;
2708
2709         p1 += d1;
2710         p2 -= d1;
2711         if(p1&256) p1= ~(p1>>31);
2712         if(p2&256) p2= ~(p2>>31);
2713
2714         src[x-1*stride] = p1;
2715         src[x+0*stride] = p2;
2716
2717         ad1= FFABS(d1)>>1;
2718
2719         d2= av_clip((p0-p3)/4, -ad1, ad1);
2720
2721         src[x-2*stride] = p0 - d2;
2722         src[x+  stride] = p3 + d2;
2723     }
2724     }
2725 }
2726
2727 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2728     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2729     int y;
2730     const int strength= ff_h263_loop_filter_strength[qscale];
2731
2732     for(y=0; y<8; y++){
2733         int d1, d2, ad1;
2734         int p0= src[y*stride-2];
2735         int p1= src[y*stride-1];
2736         int p2= src[y*stride+0];
2737         int p3= src[y*stride+1];
2738         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2739
2740         if     (d<-2*strength) d1= 0;
2741         else if(d<-  strength) d1=-2*strength - d;
2742         else if(d<   strength) d1= d;
2743         else if(d< 2*strength) d1= 2*strength - d;
2744         else                   d1= 0;
2745
2746         p1 += d1;
2747         p2 -= d1;
2748         if(p1&256) p1= ~(p1>>31);
2749         if(p2&256) p2= ~(p2>>31);
2750
2751         src[y*stride-1] = p1;
2752         src[y*stride+0] = p2;
2753
2754         ad1= FFABS(d1)>>1;
2755
2756         d2= av_clip((p0-p3)/4, -ad1, ad1);
2757
2758         src[y*stride-2] = p0 - d2;
2759         src[y*stride+1] = p3 + d2;
2760     }
2761     }
2762 }
2763
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into an int scratch plane; the top and bottom
 * rows are not filtered and are stored as 4*sample so that all rows share
 * the same scale.  Pass 2 filters horizontally and writes back: the left
 * and right columns only undo the vertical scale ((t + 2) >> 2), interior
 * columns use (t[-1] + 2*t[0] + t[1] + 8) >> 4.
 *
 * @param src    top-left sample of the block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int vert[8*8];
    int row, col;

    /* pass 1: vertical, src -> vert */
    for (row = 0; row < 8; row++) {
        const uint8_t *in  = src  + row*stride;
        int           *out = vert + row*8;
        const int border   = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (border)
                out[col] = 4*in[col];
            else
                out[col] = in[col - stride] + 2*in[col] + in[col + stride];
        }
    }

    /* pass 2: horizontal, vert -> src */
    for (row = 0; row < 8; row++) {
        const int *in  = vert + row*8;
        uint8_t   *out = src  + row*stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
    }
}
2790
/**
 * Sum of absolute differences between two 16-pixel-wide areas.
 *
 * @param v         unused
 * @param pix1      first area
 * @param pix2      second area
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2818
/**
 * 16-wide sum of absolute differences where each pix2 sample is first
 * combined with its right-hand neighbour through avg2().
 *
 * @param v         unused
 * @param pix1      reference area
 * @param pix2      area to compare; 17 columns per row are read
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2846
/**
 * 16-wide sum of absolute differences where each pix2 sample is first
 * combined with the sample directly below it through avg2().
 *
 * @param v         unused
 * @param pix1      reference area
 * @param pix2      area to compare; h + 1 rows are read
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2876
/**
 * 16-wide sum of absolute differences where each pix2 sample is first
 * combined with its right, lower and lower-right neighbours through avg4().
 *
 * @param v         unused
 * @param pix1      reference area
 * @param pix2      area to compare; 17 columns and h + 1 rows are read
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2906
/**
 * Sum of absolute differences between two 8-pixel-wide areas.
 *
 * @param v         unused
 * @param pix1      first area
 * @param pix2      second area
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8 x h samples
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2926
/**
 * 8-wide sum of absolute differences where each pix2 sample is first
 * combined with its right-hand neighbour through avg2().
 *
 * @param v         unused
 * @param pix1      reference area
 * @param pix2      area to compare; 9 columns per row are read
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2946
/**
 * 8-wide sum of absolute differences where each pix2 sample is first
 * combined with the sample directly below it through avg2().
 *
 * @param v         unused
 * @param pix1      reference area
 * @param pix2      area to compare; h + 1 rows are read
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2968
/**
 * 8-wide sum of absolute differences where each pix2 sample is first
 * combined with its right, lower and lower-right neighbours through avg4().
 *
 * @param v         unused
 * @param pix1      reference area
 * @param pix2      area to compare; 9 columns and h + 1 rows are read
 * @param line_size distance between rows (same for both areas)
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2990
2991 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2992     MpegEncContext *c = v;
2993     int score1=0;
2994     int score2=0;
2995     int x,y;
2996
2997     for(y=0; y<h; y++){
2998         for(x=0; x<16; x++){
2999             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3000         }
3001         if(y+1<h){
3002             for(x=0; x<15; x++){
3003                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3004                              - s1[x+1] + s1[x+1+stride])
3005                         -FFABS(  s2[x  ] - s2[x  +stride]
3006                              - s2[x+1] + s2[x+1+stride]);
3007             }
3008         }
3009         s1+= stride;
3010         s2+= stride;
3011     }
3012
3013     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3014     else  return score1 + FFABS(score2)*8;
3015 }
3016
3017 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3018     MpegEncContext *c = v;
3019     int score1=0;
3020     int score2=0;
3021     int x,y;
3022
3023     for(y=0; y<h; y++){
3024         for(x=0; x<8; x++){
3025             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3026         }
3027         if(y+1<h){
3028             for(x=0; x<7; x++){
3029                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3030                              - s1[x+1] + s1[x+1+stride])
3031                         -FFABS(  s2[x  ] - s2[x  +stride]
3032                              - s2[x+1] + s2[x+1+stride]);
3033             }
3034         }
3035         s1+= stride;
3036         s2+= stride;
3037     }
3038
3039     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3040     else  return score1 + FFABS(score2)*8;
3041 }
3042
3043 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3044     int i;
3045     unsigned int sum=0;
3046
3047     for(i=0; i<8*8; i++){
3048         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3049         int w= weight[i];
3050         b>>= RECON_SHIFT;
3051         assert(-512<b && b<512);
3052
3053         sum += (w*b)*(w*b)>>4;
3054     }
3055     return sum>>2;
3056 }
3057
3058 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3059     int i;
3060
3061     for(i=0; i<8*8; i++){
3062         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3063     }
3064 }
3065
3066 /**
3067  * permutes an 8x8 block.
3068  * @param block the block which will be permuted according to the given permutation vector
3069  * @param permutation the permutation vector
3070  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3071  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3072  *                  (inverse) permutated to scantable order!
3073  */
3074 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3075 {
3076     int i;
3077     DCTELEM temp[64];
3078
3079     if(last<=0) return;
3080     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3081
3082     for(i=0; i<=last; i++){
3083         const int j= scantable[i];
3084         temp[j]= block[j];
3085         block[j]=0;
3086     }
3087
3088     for(i=0; i<=last; i++){
3089         const int j= scantable[i];
3090         const int perm_j= permutation[j];
3091         block[perm_j]= temp[j];
3092     }
3093 }
3094
/**
 * Comparison function that ignores all of its arguments and scores every
 * block pair as 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3098
3099 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3100     int i;
3101
3102     memset(cmp, 0, sizeof(void*)*6);
3103
3104     for(i=0; i<6; i++){
3105         switch(type&0xFF){
3106         case FF_CMP_SAD:
3107             cmp[i]= c->sad[i];
3108             break;
3109         case FF_CMP_SATD:
3110             cmp[i]= c->hadamard8_diff[i];
3111             break;
3112         case FF_CMP_SSE:
3113             cmp[i]= c->sse[i];
3114             break;
3115         case FF_CMP_DCT:
3116             cmp[i]= c->dct_sad[i];
3117             break;
3118         case FF_CMP_DCT264:
3119             cmp[i]= c->dct264_sad[i];
3120             break;
3121         case FF_CMP_DCTMAX:
3122             cmp[i]= c->dct_max[i];
3123             break;
3124         case FF_CMP_PSNR:
3125             cmp[i]= c->quant_psnr[i];
3126             break;
3127         case FF_CMP_BIT:
3128             cmp[i]= c->bit[i];
3129             break;
3130         case FF_CMP_RD:
3131             cmp[i]= c->rd[i];
3132             break;
3133         case FF_CMP_VSAD:
3134             cmp[i]= c->vsad[i];
3135             break;
3136         case FF_CMP_VSSE:
3137             cmp[i]= c->vsse[i];
3138             break;
3139         case FF_CMP_ZERO:
3140             cmp[i]= zero_cmp;
3141             break;
3142         case FF_CMP_NSSE:
3143             cmp[i]= c->nsse[i];
3144             break;
3145 #if CONFIG_DWT
3146         case FF_CMP_W53:
3147             cmp[i]= c->w53[i];
3148             break;
3149         case FF_CMP_W97:
3150             cmp[i]= c->w97[i];
3151             break;
3152 #endif
3153         default:
3154             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3155         }
3156     }
3157 }
3158
3159 static void clear_block_c(DCTELEM *block)
3160 {
3161     memset(block, 0, sizeof(DCTELEM)*64);
3162 }
3163
3164 /**
3165  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3166  */
3167 static void clear_blocks_c(DCTELEM *blocks)
3168 {
3169     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3170 }
3171
3172 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3173     long i;
3174     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3175         long a = *(long*)(src+i);
3176         long b = *(long*)(dst+i);
3177         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3178     }
3179     for(; i<w; i++)
3180         dst[i+0] += src[i+0];
3181 }
3182
3183 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3184     long i;
3185     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3186         long a = *(long*)(src1+i);
3187         long b = *(long*)(src2+i);
3188         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3189     }
3190     for(; i<w; i++)
3191         dst[i] = src1[i]+src2[i];
3192 }
3193
3194 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3195     long i;
3196 #if !HAVE_FAST_UNALIGNED
3197     if((long)src2 & (sizeof(long)-1)){
3198         for(i=0; i+7<w; i+=8){
3199             dst[i+0] = src1[i+0]-src2[i+0];
3200             dst[i+1] = src1[i+1]-src2[i+1];
3201             dst[i+2] = src1[i+2]-src2[i+2];
3202             dst[i+3] = src1[i+3]-src2[i+3];
3203             dst[i+4] = src1[i+4]-src2[i+4];
3204             dst[i+5] = src1[i+5]-src2[i+5];
3205             dst[i+6] = src1[i+6]-src2[i+6];
3206             dst[i+7] = src1[i+7]-src2[i+7];
3207         }
3208     }else
3209 #endif
3210     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3211         long a = *(long*)(src1+i);
3212         long b = *(long*)(src2+i);
3213         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3214     }
3215     for(; i<w; i++)
3216         dst[i+0] = src1[i+0]-src2[i+0];
3217 }
3218
/**
 * Reconstruct a line from median-prediction residuals.
 *
 * For every position the predictor is mid_pred() of the left sample, the
 * sample from src1 and (left + src1 - left_top) & 0xFF; the residual from
 * diff is added to it and the result stored modulo 256.
 *
 * @param dst      reconstructed line
 * @param src1     line supplying the top samples
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: left neighbour of the first sample, out: last sample written
 * @param left_top in: top-left neighbour of the first sample, out: last src1 sample read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t cur_top  = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int grad = (cur + top - cur_top) & 0xFF;

        cur     = mid_pred(cur, top, grad) + diff[x];
        cur_top = top;
        dst[x]  = cur;
    }

    *left     = cur;
    *left_top = cur_top;
}
3235
/**
 * Compute median-prediction residuals for a line.
 *
 * For every position the predictor is mid_pred() of the left sample, the
 * sample from src1 and (left + src1 - left_top) & 0xFF; the residual stored
 * in dst is src2 minus that predictor, modulo 256.
 *
 * @param dst      residual output
 * @param src1     line supplying the top samples
 * @param src2     line being predicted
 * @param w        number of samples
 * @param left     in: left neighbour of the first sample, out: last src2 sample read
 * @param left_top in: top-left neighbour of the first sample, out: last src1 sample read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t cur_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int grad = (cur + top - cur_top) & 0xFF;
        const int pred = mid_pred(cur, top, grad);

        cur_top = top;
        cur     = src2[x];
        dst[x]  = cur - pred;
    }

    *left     = cur;
    *left_top = cur_top;
}
3253
/**
 * Reconstruct a line from left-prediction residuals (running sum).
 *
 * @param dst receives the running sum truncated to 8 bits
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the final accumulator; it is not reduced modulo 256
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3272
/* Byte offsets of the colour components inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Reconstruct a line of 4-byte pixels from per-component left-prediction
 * residuals: every component is a running sum stored truncated to 8 bits.
 *
 * @param dst   reconstructed pixels (4 bytes each)
 * @param src   residuals (4 bytes each)
 * @param w     number of pixels
 * @param red   in/out running sum of the red component
 * @param green in/out running sum of the green component
 * @param blue  in/out running sum of the blue component
 * @param alpha in/out running sum of the alpha component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int sum_r = *red;
    int sum_g = *green;
    int sum_b = *blue;
    int sum_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        sum_b += in[B];
        sum_g += in[G];
        sum_r += in[R];
        sum_a += in[A];

        out[B] = sum_b;
        out[G] = sum_g;
        out[R] = sum_r;
        out[A] = sum_a;
    }

    *red   = sum_r;
    *green = sum_g;
    *blue  = sum_b;
    *alpha = sum_a;
}
#undef B
#undef G
#undef R
#undef A
3313
/**
 * Two-point butterfly with separate inputs and outputs:
 * o1 = i1 + i2 and o2 = i1 - i2.
 * Expands to two complete statements and evaluates each input twice.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/**
 * In-place two-point butterfly: x becomes x+y and y becomes the old x-y.
 * The expansion is a braced block with temporaries named a and b, so the
 * arguments must not refer to variables of those names.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/** |x+y| + |x-y|: the summed magnitudes of the two outputs of a butterfly. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3328
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the difference src - dst.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance between the starts of successive lines
 * @param h      must be 8 (asserted)
 * @return sum of the absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of the difference */
    for (i = 0; i < 8; i++) {
        int           *row = temp + 8 * i;
        const uint8_t *p   = src + stride * i;
        const uint8_t *q   = dst + stride * i;

        BUTTERFLY2(row[0], row[1], p[0] - q[0], p[1] - q[1]);
        BUTTERFLY2(row[2], row[3], p[2] - q[2], p[3] - q[3]);
        BUTTERFLY2(row[4], row[5], p[4] - q[4], p[5] - q[5]);
        BUTTERFLY2(row[6], row[7], p[6] - q[6], p[7] - q[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stages in place, the third folded into the abs sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3380
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * a single block, with the magnitude of the mean (DC) term removed.
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride distance between the starts of successive lines
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row */
    for (i = 0; i < 8; i++) {
        int           *row = temp + 8 * i;
        const uint8_t *p   = src + stride * i;

        BUTTERFLY2(row[0], row[1], p[0], p[1]);
        BUTTERFLY2(row[2], row[3], p[2], p[3]);
        BUTTERFLY2(row[4], row[5], p[4], p[5]);
        BUTTERFLY2(row[6], row[7], p[6], p[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stages in place, the third folded into the abs sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* take the mean term back out of the total */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
3428
3429 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3430     MpegEncContext * const s= (MpegEncContext *)c;
3431     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3432
3433     assert(h==8);
3434
3435     s->dsp.diff_pixels(temp, src1, src2, stride);
3436     s->dsp.fdct(temp);
3437     return s->dsp.sum_abs_dctelem(temp);
3438 }
3439
#if CONFIG_GPL
/**
 * One 8-point 1-D integer transform built from additions and shifts only.
 * The eight inputs are read through SRC(n) and every output is emitted
 * through DST(n, value); both macros must be defined by the user before the
 * expansion. The whole expansion is a single braced block.
 * NOTE(review): going by the "dct264" name this is presumably the H.264 8x8
 * transform -- confirm against the specification.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Score an 8x8 block pair as the sum of absolute DCT8_1D transform
 * coefficients of their pixel difference (obtained from dsp.diff_pixels).
 *
 * @param c      MpegEncContext whose dsp function table is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance between the starts of successive lines
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform every column; the outputs are not stored, only
     * their absolute values are accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3492
3493 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3494     MpegEncContext * const s= (MpegEncContext *)c;
3495     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3496     int sum=0, i;
3497
3498     assert(h==8);
3499
3500     s->dsp.diff_pixels(temp, src1, src2, stride);
3501     s->dsp.fdct(temp);
3502
3503     for(i=0; i<64; i++)
3504         sum= FFMAX(sum, FFABS(temp[i]));
3505
3506     return sum;
3507 }
3508
3509 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3510     MpegEncContext * const s= (MpegEncContext *)c;
3511     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3512     DCTELEM * const bak = temp+64;
3513     int sum=0, i;
3514
3515     assert(h==8);
3516     s->mb_intra=0;
3517
3518     s->dsp.diff_pixels(temp, src1, src2, stride);
3519
3520     memcpy(bak, temp, 64*sizeof(DCTELEM));
3521
3522     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3523     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3524     ff_simple_idct(temp); //FIXME
3525
3526     for(i=0; i<64; i++)
3527         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3528
3529     return sum;
3530 }
3531
3532 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3533     MpegEncContext * const s= (MpegEncContext *)c;
3534     const uint8_t *scantable= s->intra_scantable.permutated;
3535     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3536     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3537     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3538     int i, last, run, bits, level, distortion, start_i;
3539     const int esc_length= s->ac_esc_length;
3540     uint8_t * length;
3541     uint8_t * last_length;
3542
3543     assert(h==8);
3544
3545     copy_block8(lsrc1, src1, 8, stride, 8);
3546     copy_block8(lsrc2, src2, 8, stride, 8);
3547
3548     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3549
3550     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3551
3552     bits=0;
3553
3554     if (s->mb_intra) {
3555         start_i = 1;
3556         length     = s->intra_ac_vlc_length;
3557         last_length= s->intra_ac_vlc_last_length;
3558         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3559     } else {
3560         start_i = 0;
3561         length     = s->inter_ac_vlc_length;
3562         last_length= s->inter_ac_vlc_last_length;
3563     }
3564
3565     if(last>=start_i){
3566         run=0;
3567         for(i=start_i; i<last; i++){
3568             int j= scantable[i];
3569             level= temp[j];
3570
3571             if(level){
3572                 level+=64;
3573                 if((level&(~127)) == 0){
3574                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3575                 }else
3576                     bits+= esc_length;
3577                 run=0;
3578             }else
3579                 run++;
3580         }
3581         i= scantable[last];
3582
3583         level= temp[i] + 64;
3584
3585         assert(level - 64);
3586
3587         if((level&(~127)) == 0){
3588             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3589         }else
3590             bits+= esc_length;
3591
3592     }
3593
3594     if(last>=0){
3595         if(s->mb_intra)
3596             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3597         else
3598             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3599     }
3600
3601     s->dsp.idct_add(lsrc2, 8, temp);
3602
3603     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3604
3605     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3606 }
3607
3608 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3609     MpegEncContext * const s= (MpegEncContext *)c;
3610     const uint8_t *scantable= s->intra_scantable.permutated;
3611     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3612     int i, last, run, bits, level, start_i;
3613     const int esc_length= s->ac_esc_length;
3614     uint8_t * length;
3615     uint8_t * last_length;
3616
3617     assert(h==8);
3618
3619     s->dsp.diff_pixels(temp, src1, src2, stride);
3620
3621     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3622
3623     bits=0;
3624
3625     if (s->mb_intra) {
3626         start_i = 1;
3627         length     = s->intra_ac_vlc_length;
3628         last_length= s->intra_ac_vlc_last_length;
3629         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3630     } else {
3631         start_i = 0;
3632         length     = s->inter_ac_vlc_length;
3633         last_length= s->inter_ac_vlc_last_length;
3634     }
3635
3636     if(last>=start_i){
3637         run=0;
3638         for(i=start_i; i<last; i++){
3639             int j= scantable[i];
3640             level= temp[j];
3641
3642             if(level){
3643                 level+=64;
3644                 if((level&(~127)) == 0){
3645                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3646                 }else
3647                     bits+= esc_length;
3648                 run=0;
3649             }else
3650                 run++;
3651         }
3652         i= scantable[last];
3653
3654         level= temp[i] + 64;
3655
3656         assert(level - 64);
3657
3658         if((level&(~127)) == 0){
3659             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3660         }else
3661             bits+= esc_length;
3662     }
3663
3664     return bits;
3665 }
3666
/**
 * Generate vsad_intra<size>_c(): the sum of absolute differences between each
 * line of a block and the line below it, over <size> columns and h-1 line
 * pairs. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int col, row;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - s[col + stride]);                                                       \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3684
/**
 * Vertical SAD of the difference signal s1 - s2 over a 16-pixel-wide block:
 * sums |(s1 - s2) on one line - (s1 - s2) on the next line| for h-1 line pairs.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between the starts of successive lines
 * @param h      number of lines
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3699
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/**
 * Generate vsse_intra<size>_c(): the sum of squared differences between each
 * line of a block and the line below it, over <size> columns and h-1 line
 * pairs. The c and dummy arguments are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int col, row;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - s[col + stride]);                                                          \
        s += stride;                                                                                        \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3718
/**
 * Vertical SSE of the difference signal s1 - s2 over a 16-pixel-wide block:
 * sums ((s1 - s2) on one line - (s1 - s2) on the next line)^2 for h-1 line
 * pairs.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between the starts of successive lines
 * @param h      number of lines
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3733
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector
 * @param pix2 second vector
 * @param size number of elements
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int n;

    for (n = 0; n < size; n++) {
        const int d = pix1[n] - pix2[n];
        total += d * d;
    }
    return total;
}
3742
/*
 * Instantiate a second function (second macro argument) for each of the 8x8
 * comparison functions (first macro argument).
 * NOTE(review): WRAPPER8_16_SQ is defined outside this part of the file;
 * presumably it builds the 16-pixel-wide variant by applying the 8x8 function
 * to each 8x8 quadrant and summing the results -- confirm at its definition.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3753
/**
 * Multiply dst element-wise by src in place: dst[n] *= src[n] for n < len.
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int n = 0;

    while (n < len) {
        dst[n] = dst[n] * src[n];
        n++;
    }
}
3759
/**
 * Multiply src0 by src1 read backwards: dst[n] = src0[n] * src1[len-1-n].
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
3766
/**
 * Element-wise multiply-add: dst[n] = src0[n] * src1[n] + src2[n] for n < len.
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n = 0;

    while (n < len) {
        dst[n] = src0[n] * src1[n] + src2[n];
        n++;
    }
}
3772
/**
 * Windowed combination of two half-blocks.
 *
 * Working from both ends of the 2*len output towards the middle, with
 * k = 0..len-1 and j = len-1-k:
 *   dst[k]       = src0[k]*win[len+j] - src1[j]*win[k]     + add_bias
 *   dst[len + j] = src0[k]*win[k]     + src1[j]*win[len+j] + add_bias
 *
 * @param dst      output, 2*len elements
 * @param src0     first input, len elements
 * @param src1     second input, len elements
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half of the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    float       *dst_hi = dst + len;
    const float *win_hi = win + len;
    int k, j;

    for (k = 0, j = len - 1; k < len; k++, j--) {
        const float s0 = src0[k];
        const float s1 = src1[j];
        const float wi = win[k];
        const float wj = win_hi[j];

        dst[k]    = s0*wj - s1*wi + add_bias;
        dst_hi[j] = s0*wi + s1*wj + add_bias;
    }
}
3787
/**
 * Scale a vector: dst[n] = src[n] * mul for n < len.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
3795
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 * Output pair p is src[2p..2p+1] times the two elements of sv[p], times mul.
 *
 * @param dst output, len elements
 * @param src input, len elements
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scalar factor
 * @param len number of elements; handled two at a time
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    for (n = 0; n < len; n += 2) {
        const float *vec = *sv++;

        dst[n    ] = src[n    ] * vec[0] * mul;
        dst[n + 1] = src[n + 1] * vec[1] * mul;
    }
}
3805
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 * Output group g is src[4g..4g+3] times the four elements of sv[g], times mul.
 *
 * @param dst output, len elements
 * @param src input, len elements
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scalar factor
 * @param len number of elements; handled four at a time
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    for (n = 0; n < len; n += 4) {
        const float *vec = *sv++;

        dst[n    ] = src[n    ] * vec[0] * mul;
        dst[n + 1] = src[n + 1] * vec[1] * mul;
        dst[n + 2] = src[n + 2] * vec[2] * mul;
        dst[n + 3] = src[n + 3] * vec[3] * mul;
    }
}
3817
/**
 * Scale a sequence of 2-element vectors into a flat output:
 * output pair p is the two elements of sv[p] times mul.
 *
 * @param dst output, len elements
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scalar factor
 * @param len number of elements; handled two at a time
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    for (n = 0; n < len; n += 2) {
        const float *vec = *sv++;

        dst[n    ] = vec[0] * mul;
        dst[n + 1] = vec[1] * mul;
    }
}
3827
/**
 * Scale a sequence of 4-element vectors into a flat output:
 * output group g is the four elements of sv[g] times mul.
 *
 * @param dst output, len elements
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scalar factor
 * @param len number of elements; handled four at a time
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    for (n = 0; n < len; n += 4) {
        const float *vec = *sv++;

        dst[n    ] = vec[0] * mul;
        dst[n + 1] = vec[1] * mul;
        dst[n + 2] = vec[2] * mul;
        dst[n + 3] = vec[3] * mul;
    }
}
3839
/**
 * In-place butterfly of two float vectors:
 * v1[n] becomes v1[n] + v2[n] and v2[n] becomes the old v1[n] - v2[n].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float diff = v1[n] - v2[n];

        v1[n] = v1[n] + v2[n];
        v2[n] = diff;
    }
}
3850
/**
 * Dot product of two float vectors, accumulated in single precision from the
 * first element to the last.
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n = 0;

    while (n < len) {
        acc += v1[n] * v2[n];
        n++;
    }

    return acc;
}
3861
/**
 * Convert integers to float and scale them: dst[n] = src[n] * mul for n < len.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
3867
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] using only
 * unsigned integer comparisons. Intended for min < 0 < max and assumes
 * IEEE-754 single precision: among negative values a larger pattern means a
 * more negative float, and flipping the sign bit makes positive values the
 * larger patterns.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with its sign bit flipped
 * @return mini, maxi or a, as a bit pattern
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* 1U, not 1: shifting a signed 1 into the sign bit is undefined */
    const uint32_t sign_bit = 1U << 31;

    if(a > mini) return mini;
    else if((a ^ sign_bit) > maxisign) return maxi;
    else return a;
}
3876
/**
 * Clip a float vector to [*min, *max] by comparing raw bit patterns with
 * clipf_c_one().
 *
 * Elements are processed in groups of 8, so a len that is not a multiple of 8
 * makes the last group run past len.
 *
 * @param dst output
 * @param src input
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U, not 1: shifting a signed 1 into the sign bit is undefined */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When min is negative and max positive the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied element by
 * element. Elements are processed in groups of 8, so a len that is not a
 * multiple of 8 makes the last group run past len.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3912
/**
 * Convert one float sample to an integer by working directly on the float's
 * 32-bit pattern.
 *
 * If none of bits 16..19 of the pattern is set, the result is simply the
 * pattern minus 0x8000. Otherwise the pattern is first replaced by
 * (0x43c0ffff - pattern) >> 31, which is negative when the pattern exceeds
 * 0x43c0ffff, and 0x8000 is subtracted from that.
 * NOTE(review): this presumably relies on the caller storing the result into
 * an int16_t (keeping only the low 16 bits) and on samples being pre-biased
 * so that in-range values have patterns 0x43c00000..0x43c0ffff -- confirm
 * against the callers.
 */
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        /* out of range: derive the replacement from the sign of the distance to 0x43c0ffff */
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
3923
/**
 * Convert len floats from src to int16 samples in dst, one element at a
 * time via float_to_int16_one().
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int n = 0;

    while(n < len) {
        dst[n] = float_to_int16_one(src+n);
        n++;
    }
}
3929
/**
 * Convert planar float channels to interleaved int16 samples.
 *
 * @param dst      interleaved output, channels * len samples
 * @param src      array of per-channel input pointers
 * @param len      number of samples per channel
 * @param channels number of channels; 2 takes a dedicated loop
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int ch, n, out;

    if(channels != 2){
        /* generic case: one pass per channel, stepping dst by the channel count */
        for(ch=0; ch<channels; ch++){
            out = ch;
            for(n=0; n<len; n++){
                dst[out] = float_to_int16_one(src[ch]+n);
                out += channels;
            }
        }
        return;
    }

    /* stereo: write both samples of a frame together */
    for(n=0; n<len; n++){
        dst[2*n]   = float_to_int16_one(src[0]+n);
        dst[2*n+1] = float_to_int16_one(src[1]+n);
    }
}
3943
/**
 * Dot product of two int16 vectors, with each product shifted right by
 * shift before it is accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every individual product
 * @return the accumulated sum
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--) {
        const int prod = *v1 * *v2;

        acc += prod >> shift;
        v1++;
        v2++;
    }

    return acc;
}
3953
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[k] += mul * v3[k]. Each product uses the value of v1[k] from before
 * its update.
 *
 * @param v1    first vector, modified in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return the dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int sum = 0;

    for (; order; order--) {
        sum += *v1 * *v2;
        *v1 += mul * *v3;
        v1++;
        v2++;
        v3++;
    }
    return sum;
}
3963
/* Fixed-point weights used by the WMV2 IDCT: Wk = 2048*sqrt(2)*cos(k*pi/16) */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Transform one row of 8 consecutive coefficients in place.
 * All inputs are read before any output is written.
 */
static void wmv2_idct_row(short * b)
{
    /* stage 1: weighted combinations of the coefficient pairs */
    const int o1 = W1*b[1]+W7*b[7];
    const int o7 = W7*b[1]-W1*b[7];
    const int o5 = W5*b[5]+W3*b[3];
    const int o3 = W3*b[5]-W5*b[3];
    const int e2 = W2*b[2]+W6*b[6];
    const int e6 = W6*b[2]-W2*b[6];
    const int e0 = W0*b[0]+W0*b[4];
    const int e4 = W0*b[0]-W0*b[4];
    /* stage 2: cross terms of the odd part, scaled by 181/256 */
    const int r1 = (181*(o1-o5+o7-o3)+128)>>8;
    const int r2 = (181*(o1-o5-o7+o3)+128)>>8;
    /* stage 3: butterflies shared by mirrored outputs */
    const int sum02 = e0+e2;
    const int dif02 = e0-e2;
    const int sum46 = e4+e6;
    const int dif46 = e4-e6;
    const int rnd   = 1<<7;

    b[0] = (sum02+o1+o5 + rnd)>>8;
    b[1] = (sum46+r1    + rnd)>>8;
    b[2] = (dif46+r2    + rnd)>>8;
    b[3] = (dif02+o7+o3 + rnd)>>8;
    b[4] = (dif02-o7-o3 + rnd)>>8;
    b[5] = (dif46-r2    + rnd)>>8;
    b[6] = (sum46-r1    + rnd)>>8;
    b[7] = (sum02-o1-o5 + rnd)>>8;
}

/**
 * Transform one column (8 coefficients, stride 8) in place.
 * Stage 1 results are shifted right by 3 and the outputs by 14.
 */
static void wmv2_idct_col(short * b)
{
    /* stage 1, with extended precision */
    const int o1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    const int o7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    const int o5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    const int o3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    const int e2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    const int e6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    const int e0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    const int e4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* stage 2 */
    const int r1 = (181*(o1-o5+o7-o3)+128)>>8;
    const int r2 = (181*(o1-o5-o7+o3)+128)>>8;
    /* stage 3 */
    const int sum02 = e0+e2;
    const int dif02 = e0-e2;
    const int sum46 = e4+e6;
    const int dif46 = e4-e6;
    const int rnd   = 1<<13;

    b[8*0] = (sum02+o1+o5 + rnd)>>14;
    b[8*1] = (sum46+r1    + rnd)>>14;
    b[8*2] = (dif46+r2    + rnd)>>14;
    b[8*3] = (dif02+o7+o3 + rnd)>>14;

    b[8*4] = (dif02-o7-o3 + rnd)>>14;
    b[8*5] = (dif46-r2    + rnd)>>14;
    b[8*6] = (sum46-r1    + rnd)>>14;
    b[8*7] = (sum02-o1-o5 + rnd)>>14;
}

/**
 * In-place 8x8 inverse DCT: all eight rows are transformed first, then
 * all eight columns.
 *
 * @param block 64 coefficients in row-major order
 */
void ff_wmv2_idct_c(short * block){
    int n;

    for(n=0; n<8; n++)
        wmv2_idct_row(block + 8*n);

    for(n=0; n<8; n++)
        wmv2_idct_col(block + n);
}
4036 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4037  converted */
4038 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4039 {
4040     ff_wmv2_idct_c(block);
4041     put_pixels_clamped_c(block, dest, line_size);
4042 }
4043 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4044 {
4045     ff_wmv2_idct_c(block);
4046     add_pixels_clamped_c(block, dest, line_size);
4047 }
4048 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4049 {
4050     j_rev_dct (block);
4051     put_pixels_clamped_c(block, dest, line_size);
4052 }
4053 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4054 {
4055     j_rev_dct (block);
4056     add_pixels_clamped_c(block, dest, line_size);
4057 }
4058
4059 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4060 {
4061     j_rev_dct4 (block);
4062     put_pixels_clamped4_c(block, dest, line_size);
4063 }
4064 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4065 {
4066     j_rev_dct4 (block);
4067     add_pixels_clamped4_c(block, dest, line_size);
4068 }
4069
4070 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4071 {
4072     j_rev_dct2 (block);
4073     put_pixels_clamped2_c(block, dest, line_size);
4074 }
4075 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4076 {
4077     j_rev_dct2 (block);
4078     add_pixels_clamped2_c(block, dest, line_size);
4079 }
4080
4081 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4082 {
4083     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4084
4085     dest[0] = cm[(block[0] + 4)>>3];
4086 }
4087 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4088 {
4089     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4090
4091     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4092 }
4093
4094 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4095
4096 /* init static data */
4097 av_cold void dsputil_static_init(void)
4098 {
4099     int i;
4100
4101     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4102     for(i=0;i<MAX_NEG_CROP;i++) {
4103         ff_cropTbl[i] = 0;
4104         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4105     }
4106
4107     for(i=0;i<512;i++) {
4108         ff_squareTbl[i] = (i - 256) * (i - 256);
4109     }
4110
4111     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4112 }
4113
4114 int ff_check_alignment(void){
4115     static int did_fail=0;
4116     DECLARE_ALIGNED(16, int, aligned);
4117
4118     if((intptr_t)&aligned & 15){
4119         if(!did_fail){
4120 #if HAVE_MMX || HAVE_ALTIVEC
4121             av_log(NULL, AV_LOG_ERROR,
4122                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4123                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4124                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4125                 "Do not report crashes to FFmpeg developers.\n");
4126 #endif
4127             did_fail=1;
4128         }
4129         return -1;
4130     }
4131     return 0;
4132 }
4133
4134 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4135 {
4136     int i;
4137
4138     ff_check_alignment();
4139
4140 #if CONFIG_ENCODERS
4141     if(avctx->dct_algo==FF_DCT_FASTINT) {
4142         c->fdct = fdct_ifast;
4143         c->fdct248 = fdct_ifast248;
4144     }
4145     else if(avctx->dct_algo==FF_DCT_FAAN) {
4146         c->fdct = ff_faandct;
4147         c->fdct248 = ff_faandct248;
4148     }
4149     else {
4150         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4151         c->fdct248 = ff_fdct248_islow;
4152     }
4153 #endif //CONFIG_ENCODERS
4154
4155     if(avctx->lowres==1){
4156         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4157             c->idct_put= ff_jref_idct4_put;
4158             c->idct_add= ff_jref_idct4_add;
4159         }else{
4160             c->idct_put= ff_h264_lowres_idct_put_c;
4161             c->idct_add= ff_h264_lowres_idct_add_c;
4162         }
4163         c->idct    = j_rev_dct4;
4164         c->idct_permutation_type= FF_NO_IDCT_PERM;
4165     }else if(avctx->lowres==2){
4166         c->idct_put= ff_jref_idct2_put;
4167         c->idct_add= ff_jref_idct2_add;
4168         c->idct    = j_rev_dct2;
4169         c->idct_permutation_type= FF_NO_IDCT_PERM;
4170     }else if(avctx->lowres==3){
4171         c->idct_put= ff_jref_idct1_put;
4172         c->idct_add= ff_jref_idct1_add;
4173         c->idct    = j_rev_dct1;
4174         c->idct_permutation_type= FF_NO_IDCT_PERM;
4175     }else{
4176         if(avctx->idct_algo==FF_IDCT_INT){
4177             c->idct_put= ff_jref_idct_put;
4178             c->idct_add= ff_jref_idct_add;
4179             c->idct    = j_rev_dct;
4180             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4181         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4182                 avctx->idct_algo==FF_IDCT_VP3){
4183             c->idct_put= ff_vp3_idct_put_c;
4184             c->idct_add= ff_vp3_idct_add_c;
4185             c->idct    = ff_vp3_idct_c;
4186             c->idct_permutation_type= FF_NO_IDCT_PERM;
4187         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4188             c->idct_put= ff_wmv2_idct_put_c;
4189             c->idct_add= ff_wmv2_idct_add_c;
4190             c->idct    = ff_wmv2_idct_c;
4191             c->idct_permutation_type= FF_NO_IDCT_PERM;
4192         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4193             c->idct_put= ff_faanidct_put;
4194             c->idct_add= ff_faanidct_add;
4195             c->idct    = ff_faanidct;
4196             c->idct_permutation_type= FF_NO_IDCT_PERM;
4197         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4198             c->idct_put= ff_ea_idct_put_c;
4199             c->idct_permutation_type= FF_NO_IDCT_PERM;
4200         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4201             c->idct     = ff_bink_idct_c;
4202             c->idct_add = ff_bink_idct_add_c;
4203             c->idct_put = ff_bink_idct_put_c;
4204             c->idct_permutation_type = FF_NO_IDCT_PERM;
4205         }else{ //accurate/default
4206             c->idct_put= ff_simple_idct_put;
4207             c->idct_add= ff_simple_idct_add;
4208             c->idct    = ff_simple_idct;
4209             c->idct_permutation_type= FF_NO_IDCT_PERM;
4210         }
4211     }
4212
4213     c->get_pixels = get_pixels_c;
4214     c->diff_pixels = diff_pixels_c;
4215     c->put_pixels_clamped = put_pixels_clamped_c;
4216     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4217     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4218     c->add_pixels_clamped = add_pixels_clamped_c;
4219     c->add_pixels8 = add_pixels8_c;
4220     c->add_pixels4 = add_pixels4_c;
4221     c->sum_abs_dctelem = sum_abs_dctelem_c;
4222     c->gmc1 = gmc1_c;
4223     c->gmc = ff_gmc_c;
4224     c->clear_block = clear_block_c;
4225     c->clear_blocks = clear_blocks_c;
4226     c->pix_sum = pix_sum_c;
4227     c->pix_norm1 = pix_norm1_c;
4228
4229     c->fill_block_tab[0] = fill_block16_c;
4230     c->fill_block_tab[1] = fill_block8_c;
4231     c->scale_block = scale_block_c;
4232
4233     /* TODO [0] 16  [1] 8 */
4234     c->pix_abs[0][0] = pix_abs16_c;
4235     c->pix_abs[0][1] = pix_abs16_x2_c;
4236     c->pix_abs[0][2] = pix_abs16_y2_c;
4237     c->pix_abs[0][3] = pix_abs16_xy2_c;
4238     c->pix_abs[1][0] = pix_abs8_c;
4239     c->pix_abs[1][1] = pix_abs8_x2_c;
4240     c->pix_abs[1][2] = pix_abs8_y2_c;
4241     c->pix_abs[1][3] = pix_abs8_xy2_c;
4242
4243 #define dspfunc(PFX, IDX, NUM) \
4244     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4245     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4246     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4247     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4248
4249     dspfunc(put, 0, 16);
4250     dspfunc(put_no_rnd, 0, 16);
4251     dspfunc(put, 1, 8);
4252     dspfunc(put_no_rnd, 1, 8);
4253     dspfunc(put, 2, 4);
4254     dspfunc(put, 3, 2);
4255
4256     dspfunc(avg, 0, 16);
4257     dspfunc(avg_no_rnd, 0, 16);
4258     dspfunc(avg, 1, 8);
4259     dspfunc(avg_no_rnd, 1, 8);
4260     dspfunc(avg, 2, 4);
4261     dspfunc(avg, 3, 2);
4262 #undef dspfunc
4263
4264     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4265     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4266
4267     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4268     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4269     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4270     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4271     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4272     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4273     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4274     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4275     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4276
4277     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4278     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4279     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4280     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4281     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4282     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4283     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4284     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4285     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4286
4287 #define dspfunc(PFX, IDX, NUM) \
4288     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4289     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4290     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4291     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4292     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4293     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4294     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4295     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4296     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4297     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4298     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4299     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4300     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4301     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4302     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4303     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4304
4305     dspfunc(put_qpel, 0, 16);
4306     dspfunc(put_no_rnd_qpel, 0, 16);
4307
4308     dspfunc(avg_qpel, 0, 16);
4309     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4310
4311     dspfunc(put_qpel, 1, 8);
4312     dspfunc(put_no_rnd_qpel, 1, 8);
4313
4314     dspfunc(avg_qpel, 1, 8);
4315     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4316
4317     dspfunc(put_h264_qpel, 0, 16);
4318     dspfunc(put_h264_qpel, 1, 8);
4319     dspfunc(put_h264_qpel, 2, 4);
4320     dspfunc(put_h264_qpel, 3, 2);
4321     dspfunc(avg_h264_qpel, 0, 16);
4322     dspfunc(avg_h264_qpel, 1, 8);
4323     dspfunc(avg_h264_qpel, 2, 4);
4324
4325 #undef dspfunc
4326     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4327     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4328     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4329     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4330     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4331     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4332     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4333     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4334
4335     c->draw_edges = draw_edges_c;
4336
4337 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4338     ff_mlp_init(c, avctx);
4339 #endif
4340 #if CONFIG_VC1_DECODER
4341     ff_vc1dsp_init(c,avctx);
4342 #endif
4343 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4344     ff_intrax8dsp_init(c,avctx);
4345 #endif
4346 #if CONFIG_RV30_DECODER
4347     ff_rv30dsp_init(c,avctx);
4348 #endif
4349 #if CONFIG_RV40_DECODER
4350     ff_rv40dsp_init(c,avctx);
4351     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4352     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4353     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4354     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4355 #endif
4356
4357     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4358     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4359     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4360     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4361     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4362     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4363     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4364     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4365
4366 #define SET_CMP_FUNC(name) \
4367     c->name[0]= name ## 16_c;\
4368     c->name[1]= name ## 8x8_c;
4369
4370     SET_CMP_FUNC(hadamard8_diff)
4371     c->hadamard8_diff[4]= hadamard8_intra16_c;
4372     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4373     SET_CMP_FUNC(dct_sad)
4374     SET_CMP_FUNC(dct_max)
4375 #if CONFIG_GPL
4376     SET_CMP_FUNC(dct264_sad)
4377 #endif
4378     c->sad[0]= pix_abs16_c;
4379     c->sad[1]= pix_abs8_c;
4380     c->sse[0]= sse16_c;
4381     c->sse[1]= sse8_c;
4382     c->sse[2]= sse4_c;
4383     SET_CMP_FUNC(quant_psnr)
4384     SET_CMP_FUNC(rd)
4385     SET_CMP_FUNC(bit)
4386     c->vsad[0]= vsad16_c;
4387     c->vsad[4]= vsad_intra16_c;
4388     c->vsad[5]= vsad_intra8_c;
4389     c->vsse[0]= vsse16_c;
4390     c->vsse[4]= vsse_intra16_c;
4391     c->vsse[5]= vsse_intra8_c;
4392     c->nsse[0]= nsse16_c;
4393     c->nsse[1]= nsse8_c;
4394 #if CONFIG_DWT
4395     ff_dsputil_init_dwt(c);
4396 #endif
4397
4398     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4399
4400     c->add_bytes= add_bytes_c;
4401     c->add_bytes_l2= add_bytes_l2_c;
4402     c->diff_bytes= diff_bytes_c;
4403     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4404     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4405     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4406     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4407     c->bswap_buf= bswap_buf;
4408 #if CONFIG_PNG_DECODER
4409     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4410 #endif
4411
4412     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4413         c->h263_h_loop_filter= h263_h_loop_filter_c;
4414         c->h263_v_loop_filter= h263_v_loop_filter_c;
4415     }
4416
4417     if (CONFIG_VP3_DECODER) {
4418         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4419         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4420         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4421     }
4422
4423     c->h261_loop_filter= h261_loop_filter_c;
4424
4425     c->try_8x8basis= try_8x8basis_c;
4426     c->add_8x8basis= add_8x8basis_c;
4427
4428 #if CONFIG_VORBIS_DECODER
4429     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4430 #endif
4431 #if CONFIG_AC3_DECODER
4432     c->ac3_downmix = ff_ac3_downmix_c;
4433 #endif
4434 #if CONFIG_LPC
4435     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4436 #endif
4437     c->vector_fmul = vector_fmul_c;
4438     c->vector_fmul_reverse = vector_fmul_reverse_c;
4439     c->vector_fmul_add = vector_fmul_add_c;
4440     c->vector_fmul_window = ff_vector_fmul_window_c;
4441     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4442     c->vector_clipf = vector_clipf_c;
4443     c->float_to_int16 = ff_float_to_int16_c;
4444     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4445     c->scalarproduct_int16 = scalarproduct_int16_c;
4446     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4447     c->scalarproduct_float = scalarproduct_float_c;
4448     c->butterflies_float = butterflies_float_c;
4449     c->vector_fmul_scalar = vector_fmul_scalar_c;
4450
4451     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4452     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4453
4454     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4455     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4456
4457     c->shrink[0]= av_image_copy_plane;
4458     c->shrink[1]= ff_shrink22;
4459     c->shrink[2]= ff_shrink44;
4460     c->shrink[3]= ff_shrink88;
4461
4462     c->prefetch= just_return;
4463
4464     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4465     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4466
4467     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4468     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4469     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4470     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4471     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4472     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4473     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4474     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4475     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4476
4477     for(i=0; i<64; i++){
4478         if(!c->put_2tap_qpel_pixels_tab[0][i])
4479             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4480         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4481             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4482     }
4483
4484     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4485     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4486     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4487     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4488
4489     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4490     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4491     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4492     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4493
4494     switch(c->idct_permutation_type){
4495     case FF_NO_IDCT_PERM:
4496         for(i=0; i<64; i++)
4497             c->idct_permutation[i]= i;
4498         break;
4499     case FF_LIBMPEG2_IDCT_PERM:
4500         for(i=0; i<64; i++)
4501             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4502         break;
4503     case FF_SIMPLE_IDCT_PERM:
4504         for(i=0; i<64; i++)
4505             c->idct_permutation[i]= simple_mmx_permutation[i];
4506         break;
4507     case FF_TRANSPOSE_IDCT_PERM:
4508         for(i=0; i<64; i++)
4509             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4510         break;
4511     case FF_PARTTRANS_IDCT_PERM:
4512         for(i=0; i<64; i++)
4513             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4514         break;
4515     case FF_SSE2_IDCT_PERM:
4516         for(i=0; i<64; i++)
4517             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4518         break;
4519     default:
4520         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4521     }
4522 }
4523