git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
/* Byte lookup table. Users in this file index it through
 * ff_cropTbl + MAX_NEG_CROP, so indices from -MAX_NEG_CROP up to
 * 255 + MAX_NEG_CROP stay inside the array. It is zero-initialized here;
 * presumably the real contents are filled in by init code elsewhere -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* 32-bit lookup table. Users in this file index it through
 * ff_squareTbl + 256 with a difference of two bytes (-255..255).
 * Zero-initialized here; presumably filled in elsewhere -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is 0x0101...01, so the product repeats the byte across an unsigned long)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  49
  50 const uint8_t ff_zigzag_direct[64] = {
  51     0,   1,  8, 16,  9,  2,  3, 10,
  52     17, 24, 32, 25, 18, 11,  4,  5,
  53     12, 19, 26, 33, 40, 48, 41, 34,
  54     27, 20, 13,  6,  7, 14, 21, 28,
  55     35, 42, 49, 56, 57, 50, 43, 36,
  56     29, 22, 15, 23, 30, 37, 44, 51,
  57     58, 59, 52, 45, 38, 31, 39, 46,
  58     53, 60, 61, 54, 47, 55, 62, 63
  59 };
  60
  61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  62    specification, we interleave the fields */
  63 const uint8_t ff_zigzag248_direct[64] = {
  64      0,  8,  1,  9, 16, 24,  2, 10,
  65     17, 25, 32, 40, 48, 56, 33, 41,
  66     18, 26,  3, 11,  4, 12, 19, 27,
  67     34, 42, 49, 57, 50, 58, 35, 43,
  68     20, 28,  5, 13,  6, 14, 21, 29,
  69     36, 44, 51, 59, 52, 60, 37, 45,
  70     22, 30,  7, 15, 23, 31, 38, 46,
  71     53, 61, 54, 62, 39, 47, 55, 63,
  72 };
  73
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
 * Only the storage (64 uint16_t entries, declared with 16-byte alignment)
 * is defined here; presumably it is filled in by init code elsewhere
 * in the file -- TODO confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  76
  77 const uint8_t ff_alternate_horizontal_scan[64] = {
  78     0,  1,   2,  3,  8,  9, 16, 17,
  79     10, 11,  4,  5,  6,  7, 15, 14,
  80     13, 12, 19, 18, 24, 25, 32, 33,
  81     26, 27, 20, 21, 22, 23, 28, 29,
  82     30, 31, 34, 35, 40, 41, 48, 49,
  83     42, 43, 36, 37, 38, 39, 44, 45,
  84     46, 47, 50, 51, 56, 57, 58, 59,
  85     52, 53, 54, 55, 60, 61, 62, 63,
  86 };
  87
  88 const uint8_t ff_alternate_vertical_scan[64] = {
  89     0,  8,  16, 24,  1,  9,  2, 10,
  90     17, 25, 32, 40, 48, 56, 57, 49,
  91     41, 33, 26, 18,  3, 11,  4, 12,
  92     19, 27, 34, 42, 50, 58, 35, 43,
  93     51, 59, 20, 28,  5, 13,  6, 14,
  94     21, 29, 36, 44, 52, 60, 37, 45,
  95     53, 61, 22, 30,  7, 15, 23, 31,
  96     38, 46, 54, 62, 39, 47, 55, 63,
  97 };
  98
  99 /* Input permutation for the simple_idct_mmx */
 100 static const uint8_t simple_mmx_permutation[64]={
 101         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 102         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 103         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 104         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 105         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 106         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 107         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 108         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 109 };
 110
 111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 112
 113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 114     int i;
 115     int end;
 116
 117     st->scantable= src_scantable;
 118
 119     for(i=0; i<64; i++){
 120         int j;
 121         j = src_scantable[i];
 122         st->permutated[i] = permutation[j];
 123 #if ARCH_PPC
 124         st->inverse[j] = i;
 125 #endif
 126     }
 127
 128     end=-1;
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = st->permutated[i];
 132         if(j>end) end=j;
 133         st->raster_end[i]= end;
 134     }
 135 }
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = ff_squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= av_bswap32(src[i+0]);
 212         dst[i+1]= av_bswap32(src[i+1]);
 213         dst[i+2]= av_bswap32(src[i+2]);
 214         dst[i+3]= av_bswap32(src[i+3]);
 215         dst[i+4]= av_bswap32(src[i+4]);
 216         dst[i+5]= av_bswap32(src[i+5]);
 217         dst[i+6]= av_bswap32(src[i+6]);
 218         dst[i+7]= av_bswap32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222     }
 223 }
 224
 225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 226 {
 227     while (len--)
 228         *dst++ = av_bswap16(*src++);
 229 }
 230
 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 232 {
 233     int s, i;
 234     uint32_t *sq = ff_squareTbl + 256;
 235
 236     s = 0;
 237     for (i = 0; i < h; i++) {
 238         s += sq[pix1[0] - pix2[0]];
 239         s += sq[pix1[1] - pix2[1]];
 240         s += sq[pix1[2] - pix2[2]];
 241         s += sq[pix1[3] - pix2[3]];
 242         pix1 += line_size;
 243         pix2 += line_size;
 244     }
 245     return s;
 246 }
 247
 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 249 {
 250     int s, i;
 251     uint32_t *sq = ff_squareTbl + 256;
 252
 253     s = 0;
 254     for (i = 0; i < h; i++) {
 255         s += sq[pix1[0] - pix2[0]];
 256         s += sq[pix1[1] - pix2[1]];
 257         s += sq[pix1[2] - pix2[2]];
 258         s += sq[pix1[3] - pix2[3]];
 259         s += sq[pix1[4] - pix2[4]];
 260         s += sq[pix1[5] - pix2[5]];
 261         s += sq[pix1[6] - pix2[6]];
 262         s += sq[pix1[7] - pix2[7]];
 263         pix1 += line_size;
 264         pix2 += line_size;
 265     }
 266     return s;
 267 }
 268
 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 270 {
 271     int s, i;
 272     uint32_t *sq = ff_squareTbl + 256;
 273
 274     s = 0;
 275     for (i = 0; i < h; i++) {
 276         s += sq[pix1[ 0] - pix2[ 0]];
 277         s += sq[pix1[ 1] - pix2[ 1]];
 278         s += sq[pix1[ 2] - pix2[ 2]];
 279         s += sq[pix1[ 3] - pix2[ 3]];
 280         s += sq[pix1[ 4] - pix2[ 4]];
 281         s += sq[pix1[ 5] - pix2[ 5]];
 282         s += sq[pix1[ 6] - pix2[ 6]];
 283         s += sq[pix1[ 7] - pix2[ 7]];
 284         s += sq[pix1[ 8] - pix2[ 8]];
 285         s += sq[pix1[ 9] - pix2[ 9]];
 286         s += sq[pix1[10] - pix2[10]];
 287         s += sq[pix1[11] - pix2[11]];
 288         s += sq[pix1[12] - pix2[12]];
 289         s += sq[pix1[13] - pix2[13]];
 290         s += sq[pix1[14] - pix2[14]];
 291         s += sq[pix1[15] - pix2[15]];
 292
 293         pix1 += line_size;
 294         pix2 += line_size;
 295     }
 296     return s;
 297 }
 298
 299 /* draw the edges of width 'w' of an image of size width, height */
 300 //FIXME check that this is ok for mpeg4 interlaced
 301 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
 302 {
 303     uint8_t *ptr, *last_line;
 304     int i;
 305
 306     /* left and right */
 307     ptr = buf;
 308     for(i=0;i<height;i++) {
 309         memset(ptr - w, ptr[0], w);
 310         memset(ptr + width, ptr[width-1], w);
 311         ptr += wrap;
 312     }
 313
 314     /* top and bottom + corners */
 315     buf -= w;
 316     last_line = buf + (height - 1) * wrap;
 317     if (sides & EDGE_TOP)
 318         for(i = 0; i < w; i++)
 319             memcpy(buf - (i + 1) * wrap, buf, width + w + w); // top
 320     if (sides & EDGE_BOTTOM)
 321         for (i = 0; i < w; i++)
 322             memcpy(last_line + (i + 1) * wrap, last_line, width + w + w); // bottom
 323 }
 324
 325 /**
 326  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 327  * @param buf destination buffer
 328  * @param src source buffer
 329  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 330  * @param block_w width of block
 331  * @param block_h height of block
 332  * @param src_x x coordinate of the top left sample of the block in the source buffer
 333  * @param src_y y coordinate of the top left sample of the block in the source buffer
 334  * @param w width of the source buffer
 335  * @param h height of the source buffer
 336  */
 337 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 338                                     int src_x, int src_y, int w, int h){
 339     int x, y;
 340     int start_y, start_x, end_y, end_x;
 341
 342     if(src_y>= h){
 343         src+= (h-1-src_y)*linesize;
 344         src_y=h-1;
 345     }else if(src_y<=-block_h){
 346         src+= (1-block_h-src_y)*linesize;
 347         src_y=1-block_h;
 348     }
 349     if(src_x>= w){
 350         src+= (w-1-src_x);
 351         src_x=w-1;
 352     }else if(src_x<=-block_w){
 353         src+= (1-block_w-src_x);
 354         src_x=1-block_w;
 355     }
 356
 357     start_y= FFMAX(0, -src_y);
 358     start_x= FFMAX(0, -src_x);
 359     end_y= FFMIN(block_h, h-src_y);
 360     end_x= FFMIN(block_w, w-src_x);
 361     assert(start_y < end_y && block_h);
 362     assert(start_x < end_x && block_w);
 363
 364     w    = end_x - start_x;
 365     src += start_y*linesize + start_x;
 366     buf += start_x;
 367
 368     //top
 369     for(y=0; y<start_y; y++){
 370         memcpy(buf, src, w);
 371         buf += linesize;
 372     }
 373
 374     // copy existing part
 375     for(; y<end_y; y++){
 376         memcpy(buf, src, w);
 377         src += linesize;
 378         buf += linesize;
 379     }
 380
 381     //bottom
 382     src -= linesize;
 383     for(; y<block_h; y++){
 384         memcpy(buf, src, w);
 385         buf += linesize;
 386     }
 387
 388     buf -= block_h * linesize + start_x;
 389     while (block_h--){
 390        //left
 391         for(x=0; x<start_x; x++){
 392             buf[x] = buf[start_x];
 393         }
 394
 395        //right
 396         for(x=end_x; x<block_w; x++){
 397             buf[x] = buf[end_x - 1];
 398         }
 399         buf += linesize;
 400     }
 401 }
 402
 403 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 404 {
 405     int i;
 406
 407     /* read the pixels */
 408     for(i=0;i<8;i++) {
 409         block[0] = pixels[0];
 410         block[1] = pixels[1];
 411         block[2] = pixels[2];
 412         block[3] = pixels[3];
 413         block[4] = pixels[4];
 414         block[5] = pixels[5];
 415         block[6] = pixels[6];
 416         block[7] = pixels[7];
 417         pixels += line_size;
 418         block += 8;
 419     }
 420 }
 421
 422 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 423                           const uint8_t *s2, int stride){
 424     int i;
 425
 426     /* read the pixels */
 427     for(i=0;i<8;i++) {
 428         block[0] = s1[0] - s2[0];
 429         block[1] = s1[1] - s2[1];
 430         block[2] = s1[2] - s2[2];
 431         block[3] = s1[3] - s2[3];
 432         block[4] = s1[4] - s2[4];
 433         block[5] = s1[5] - s2[5];
 434         block[6] = s1[6] - s2[6];
 435         block[7] = s1[7] - s2[7];
 436         s1 += stride;
 437         s2 += stride;
 438         block += 8;
 439     }
 440 }
 441
 442
 443 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 444                              int line_size)
 445 {
 446     int i;
 447     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 448
 449     /* read the pixels */
 450     for(i=0;i<8;i++) {
 451         pixels[0] = cm[block[0]];
 452         pixels[1] = cm[block[1]];
 453         pixels[2] = cm[block[2]];
 454         pixels[3] = cm[block[3]];
 455         pixels[4] = cm[block[4]];
 456         pixels[5] = cm[block[5]];
 457         pixels[6] = cm[block[6]];
 458         pixels[7] = cm[block[7]];
 459
 460         pixels += line_size;
 461         block += 8;
 462     }
 463 }
 464
 465 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 466                                  int line_size)
 467 {
 468     int i;
 469     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 470
 471     /* read the pixels */
 472     for(i=0;i<4;i++) {
 473         pixels[0] = cm[block[0]];
 474         pixels[1] = cm[block[1]];
 475         pixels[2] = cm[block[2]];
 476         pixels[3] = cm[block[3]];
 477
 478         pixels += line_size;
 479         block += 8;
 480     }
 481 }
 482
 483 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 484                                  int line_size)
 485 {
 486     int i;
 487     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 488
 489     /* read the pixels */
 490     for(i=0;i<2;i++) {
 491         pixels[0] = cm[block[0]];
 492         pixels[1] = cm[block[1]];
 493
 494         pixels += line_size;
 495         block += 8;
 496     }
 497 }
 498
 499 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 500                                     uint8_t *restrict pixels,
 501                                     int line_size)
 502 {
 503     int i, j;
 504
 505     for (i = 0; i < 8; i++) {
 506         for (j = 0; j < 8; j++) {
 507             if (*block < -128)
 508                 *pixels = 0;
 509             else if (*block > 127)
 510                 *pixels = 255;
 511             else
 512                 *pixels = (uint8_t)(*block + 128);
 513             block++;
 514             pixels++;
 515         }
 516         pixels += (line_size - 8);
 517     }
 518 }
 519
 520 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 521                                     int line_size)
 522 {
 523     int i;
 524
 525     /* read the pixels */
 526     for(i=0;i<8;i++) {
 527         pixels[0] = block[0];
 528         pixels[1] = block[1];
 529         pixels[2] = block[2];
 530         pixels[3] = block[3];
 531         pixels[4] = block[4];
 532         pixels[5] = block[5];
 533         pixels[6] = block[6];
 534         pixels[7] = block[7];
 535
 536         pixels += line_size;
 537         block += 8;
 538     }
 539 }
 540
 541 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 542                              int line_size)
 543 {
 544     int i;
 545     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 546
 547     /* read the pixels */
 548     for(i=0;i<8;i++) {
 549         pixels[0] = cm[pixels[0] + block[0]];
 550         pixels[1] = cm[pixels[1] + block[1]];
 551         pixels[2] = cm[pixels[2] + block[2]];
 552         pixels[3] = cm[pixels[3] + block[3]];
 553         pixels[4] = cm[pixels[4] + block[4]];
 554         pixels[5] = cm[pixels[5] + block[5]];
 555         pixels[6] = cm[pixels[6] + block[6]];
 556         pixels[7] = cm[pixels[7] + block[7]];
 557         pixels += line_size;
 558         block += 8;
 559     }
 560 }
 561
 562 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 563                           int line_size)
 564 {
 565     int i;
 566     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 567
 568     /* read the pixels */
 569     for(i=0;i<4;i++) {
 570         pixels[0] = cm[pixels[0] + block[0]];
 571         pixels[1] = cm[pixels[1] + block[1]];
 572         pixels[2] = cm[pixels[2] + block[2]];
 573         pixels[3] = cm[pixels[3] + block[3]];
 574         pixels += line_size;
 575         block += 8;
 576     }
 577 }
 578
 579 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 580                           int line_size)
 581 {
 582     int i;
 583     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 584
 585     /* read the pixels */
 586     for(i=0;i<2;i++) {
 587         pixels[0] = cm[pixels[0] + block[0]];
 588         pixels[1] = cm[pixels[1] + block[1]];
 589         pixels += line_size;
 590         block += 8;
 591     }
 592 }
 593
 594 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 595 {
 596     int i;
 597     for(i=0;i<8;i++) {
 598         pixels[0] += block[0];
 599         pixels[1] += block[1];
 600         pixels[2] += block[2];
 601         pixels[3] += block[3];
 602         pixels[4] += block[4];
 603         pixels[5] += block[5];
 604         pixels[6] += block[6];
 605         pixels[7] += block[7];
 606         pixels += line_size;
 607         block += 8;
 608     }
 609 }
 610
 611 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 612 {
 613     int i;
 614     for(i=0;i<4;i++) {
 615         pixels[0] += block[0];
 616         pixels[1] += block[1];
 617         pixels[2] += block[2];
 618         pixels[3] += block[3];
 619         pixels += line_size;
 620         block += 4;
 621     }
 622 }
 623
 624 static int sum_abs_dctelem_c(DCTELEM *block)
 625 {
 626     int sum=0, i;
 627     for(i=0; i<64; i++)
 628         sum+= FFABS(block[i]);
 629     return sum;
 630 }
 631
 632 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 633 {
 634     int i;
 635
 636     for (i = 0; i < h; i++) {
 637         memset(block, value, 16);
 638         block += line_size;
 639     }
 640 }
 641
 642 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 643 {
 644     int i;
 645
 646     for (i = 0; i < h; i++) {
 647         memset(block, value, 8);
 648         block += line_size;
 649     }
 650 }
 651
 652 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 653 {
 654     int i, j;
 655     uint16_t *dst1 = (uint16_t *) dst;
 656     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 657
 658     for (j = 0; j < 8; j++) {
 659         for (i = 0; i < 8; i++) {
 660             dst1[i] = dst2[i] = src[i] * 0x0101;
 661         }
 662         src  += 8;
 663         dst1 += linesize;
 664         dst2 += linesize;
 665     }
 666 }
 667
 668 #if 0
 669
 670 #define PIXOP2(OPNAME, OP) \
 671 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 672 {\
 673     int i;\
 674     for(i=0; i<h; i++){\
 675         OP(*((uint64_t*)block), AV_RN64(pixels));\
 676         pixels+=line_size;\
 677         block +=line_size;\
 678     }\
 679 }\
 680 \
 681 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 682 {\
 683     int i;\
 684     for(i=0; i<h; i++){\
 685         const uint64_t a= AV_RN64(pixels  );\
 686         const uint64_t b= AV_RN64(pixels+1);\
 687         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 688         pixels+=line_size;\
 689         block +=line_size;\
 690     }\
 691 }\
 692 \
 693 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 694 {\
 695     int i;\
 696     for(i=0; i<h; i++){\
 697         const uint64_t a= AV_RN64(pixels  );\
 698         const uint64_t b= AV_RN64(pixels+1);\
 699         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 700         pixels+=line_size;\
 701         block +=line_size;\
 702     }\
 703 }\
 704 \
 705 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 706 {\
 707     int i;\
 708     for(i=0; i<h; i++){\
 709         const uint64_t a= AV_RN64(pixels          );\
 710         const uint64_t b= AV_RN64(pixels+line_size);\
 711         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 712         pixels+=line_size;\
 713         block +=line_size;\
 714     }\
 715 }\
 716 \
 717 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 718 {\
 719     int i;\
 720     for(i=0; i<h; i++){\
 721         const uint64_t a= AV_RN64(pixels          );\
 722         const uint64_t b= AV_RN64(pixels+line_size);\
 723         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 724         pixels+=line_size;\
 725         block +=line_size;\
 726     }\
 727 }\
 728 \
 729 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 730 {\
 731         int i;\
 732         const uint64_t a= AV_RN64(pixels  );\
 733         const uint64_t b= AV_RN64(pixels+1);\
 734         uint64_t l0=  (a&0x0303030303030303ULL)\
 735                     + (b&0x0303030303030303ULL)\
 736                     + 0x0202020202020202ULL;\
 737         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 738                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 739         uint64_t l1,h1;\
 740 \
 741         pixels+=line_size;\
 742         for(i=0; i<h; i+=2){\
 743             uint64_t a= AV_RN64(pixels  );\
 744             uint64_t b= AV_RN64(pixels+1);\
 745             l1=  (a&0x0303030303030303ULL)\
 746                + (b&0x0303030303030303ULL);\
 747             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 748               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 749             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 750             pixels+=line_size;\
 751             block +=line_size;\
 752             a= AV_RN64(pixels  );\
 753             b= AV_RN64(pixels+1);\
 754             l0=  (a&0x0303030303030303ULL)\
 755                + (b&0x0303030303030303ULL)\
 756                + 0x0202020202020202ULL;\
 757             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 758               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 759             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 760             pixels+=line_size;\
 761             block +=line_size;\
 762         }\
 763 }\
 764 \
 765 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 766 {\
 767         int i;\
 768         const uint64_t a= AV_RN64(pixels  );\
 769         const uint64_t b= AV_RN64(pixels+1);\
 770         uint64_t l0=  (a&0x0303030303030303ULL)\
 771                     + (b&0x0303030303030303ULL)\
 772                     + 0x0101010101010101ULL;\
 773         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 774                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 775         uint64_t l1,h1;\
 776 \
 777         pixels+=line_size;\
 778         for(i=0; i<h; i+=2){\
 779             uint64_t a= AV_RN64(pixels  );\
 780             uint64_t b= AV_RN64(pixels+1);\
 781             l1=  (a&0x0303030303030303ULL)\
 782                + (b&0x0303030303030303ULL);\
 783             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 784               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 785             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 786             pixels+=line_size;\
 787             block +=line_size;\
 788             a= AV_RN64(pixels  );\
 789             b= AV_RN64(pixels+1);\
 790             l0=  (a&0x0303030303030303ULL)\
 791                + (b&0x0303030303030303ULL)\
 792                + 0x0101010101010101ULL;\
 793             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 794               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 795             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 796             pixels+=line_size;\
 797             block +=line_size;\
 798         }\
 799 }\
 800 \
 801 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 802 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 803 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 804 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 805 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 806 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 807 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 808
 809 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 810 #else // 64 bit variant
 811
 812 #define PIXOP2(OPNAME, OP) \
 813 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 814     int i;\
 815     for(i=0; i<h; i++){\
 816         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 817         pixels+=line_size;\
 818         block +=line_size;\
 819     }\
 820 }\
 821 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 822     int i;\
 823     for(i=0; i<h; i++){\
 824         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 825         pixels+=line_size;\
 826         block +=line_size;\
 827     }\
 828 }\
 829 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     int i;\
 831     for(i=0; i<h; i++){\
 832         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 833         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 834         pixels+=line_size;\
 835         block +=line_size;\
 836     }\
 837 }\
 838 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 839     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 840 }\
 841 \
 842 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 843                                                 int src_stride1, int src_stride2, int h){\
 844     int i;\
 845     for(i=0; i<h; i++){\
 846         uint32_t a,b;\
 847         a= AV_RN32(&src1[i*src_stride1  ]);\
 848         b= AV_RN32(&src2[i*src_stride2  ]);\
 849         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 850         a= AV_RN32(&src1[i*src_stride1+4]);\
 851         b= AV_RN32(&src2[i*src_stride2+4]);\
 852         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 853     }\
 854 }\
 855 \
 856 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 857                                                 int src_stride1, int src_stride2, int h){\
 858     int i;\
 859     for(i=0; i<h; i++){\
 860         uint32_t a,b;\
 861         a= AV_RN32(&src1[i*src_stride1  ]);\
 862         b= AV_RN32(&src2[i*src_stride2  ]);\
 863         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 864         a= AV_RN32(&src1[i*src_stride1+4]);\
 865         b= AV_RN32(&src2[i*src_stride2+4]);\
 866         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 867     }\
 868 }\
 869 \
 870 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 871                                                 int src_stride1, int src_stride2, int h){\
 872     int i;\
 873     for(i=0; i<h; i++){\
 874         uint32_t a,b;\
 875         a= AV_RN32(&src1[i*src_stride1  ]);\
 876         b= AV_RN32(&src2[i*src_stride2  ]);\
 877         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 878     }\
 879 }\
 880 \
 881 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 882                                                 int src_stride1, int src_stride2, int h){\
 883     int i;\
 884     for(i=0; i<h; i++){\
 885         uint32_t a,b;\
 886         a= AV_RN16(&src1[i*src_stride1  ]);\
 887         b= AV_RN16(&src2[i*src_stride2  ]);\
 888         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 889     }\
 890 }\
 891 \
 892 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 893                                                 int src_stride1, int src_stride2, int h){\
 894     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 895     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 896 }\
 897 \
 898 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 899                                                 int src_stride1, int src_stride2, int h){\
 900     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 901     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 902 }\
 903 \
 904 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 905     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 906 }\
 907 \
 908 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 909     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 910 }\
 911 \
 912 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 913     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 914 }\
 915 \
 916 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 917     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 918 }\
 919 \
 920 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 921                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 922     int i;\
 923     for(i=0; i<h; i++){\
 924         uint32_t a, b, c, d, l0, l1, h0, h1;\
 925         a= AV_RN32(&src1[i*src_stride1]);\
 926         b= AV_RN32(&src2[i*src_stride2]);\
 927         c= AV_RN32(&src3[i*src_stride3]);\
 928         d= AV_RN32(&src4[i*src_stride4]);\
 929         l0=  (a&0x03030303UL)\
 930            + (b&0x03030303UL)\
 931            + 0x02020202UL;\
 932         h0= ((a&0xFCFCFCFCUL)>>2)\
 933           + ((b&0xFCFCFCFCUL)>>2);\
 934         l1=  (c&0x03030303UL)\
 935            + (d&0x03030303UL);\
 936         h1= ((c&0xFCFCFCFCUL)>>2)\
 937           + ((d&0xFCFCFCFCUL)>>2);\
 938         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 939         a= AV_RN32(&src1[i*src_stride1+4]);\
 940         b= AV_RN32(&src2[i*src_stride2+4]);\
 941         c= AV_RN32(&src3[i*src_stride3+4]);\
 942         d= AV_RN32(&src4[i*src_stride4+4]);\
 943         l0=  (a&0x03030303UL)\
 944            + (b&0x03030303UL)\
 945            + 0x02020202UL;\
 946         h0= ((a&0xFCFCFCFCUL)>>2)\
 947           + ((b&0xFCFCFCFCUL)>>2);\
 948         l1=  (c&0x03030303UL)\
 949            + (d&0x03030303UL);\
 950         h1= ((c&0xFCFCFCFCUL)>>2)\
 951           + ((d&0xFCFCFCFCUL)>>2);\
 952         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 953     }\
 954 }\
 955 \
 956 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 957     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 958 }\
 959 \
 960 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 961     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 962 }\
 963 \
 964 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 965     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 966 }\
 967 \
 968 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 969     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 970 }\
 971 \
 972 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 973                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 974     int i;\
 975     for(i=0; i<h; i++){\
 976         uint32_t a, b, c, d, l0, l1, h0, h1;\
 977         a= AV_RN32(&src1[i*src_stride1]);\
 978         b= AV_RN32(&src2[i*src_stride2]);\
 979         c= AV_RN32(&src3[i*src_stride3]);\
 980         d= AV_RN32(&src4[i*src_stride4]);\
 981         l0=  (a&0x03030303UL)\
 982            + (b&0x03030303UL)\
 983            + 0x01010101UL;\
 984         h0= ((a&0xFCFCFCFCUL)>>2)\
 985           + ((b&0xFCFCFCFCUL)>>2);\
 986         l1=  (c&0x03030303UL)\
 987            + (d&0x03030303UL);\
 988         h1= ((c&0xFCFCFCFCUL)>>2)\
 989           + ((d&0xFCFCFCFCUL)>>2);\
 990         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 991         a= AV_RN32(&src1[i*src_stride1+4]);\
 992         b= AV_RN32(&src2[i*src_stride2+4]);\
 993         c= AV_RN32(&src3[i*src_stride3+4]);\
 994         d= AV_RN32(&src4[i*src_stride4+4]);\
 995         l0=  (a&0x03030303UL)\
 996            + (b&0x03030303UL)\
 997            + 0x01010101UL;\
 998         h0= ((a&0xFCFCFCFCUL)>>2)\
 999           + ((b&0xFCFCFCFCUL)>>2);\
1000         l1=  (c&0x03030303UL)\
1001            + (d&0x03030303UL);\
1002         h1= ((c&0xFCFCFCFCUL)>>2)\
1003           + ((d&0xFCFCFCFCUL)>>2);\
1004         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1005     }\
1006 }\
1007 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1008                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1009     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1010     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1011 }\
1012 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1013                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1014     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1015     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1016 }\
1017 \
1018 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1019 {\
1020         int i, a0, b0, a1, b1;\
1021         a0= pixels[0];\
1022         b0= pixels[1] + 2;\
1023         a0 += b0;\
1024         b0 += pixels[2];\
1025 \
1026         pixels+=line_size;\
1027         for(i=0; i<h; i+=2){\
1028             a1= pixels[0];\
1029             b1= pixels[1];\
1030             a1 += b1;\
1031             b1 += pixels[2];\
1032 \
1033             block[0]= (a1+a0)>>2; /* FIXME non put */\
1034             block[1]= (b1+b0)>>2;\
1035 \
1036             pixels+=line_size;\
1037             block +=line_size;\
1038 \
1039             a0= pixels[0];\
1040             b0= pixels[1] + 2;\
1041             a0 += b0;\
1042             b0 += pixels[2];\
1043 \
1044             block[0]= (a1+a0)>>2;\
1045             block[1]= (b1+b0)>>2;\
1046             pixels+=line_size;\
1047             block +=line_size;\
1048         }\
1049 }\
1050 \
1051 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1052 {\
1053         int i;\
1054         const uint32_t a= AV_RN32(pixels  );\
1055         const uint32_t b= AV_RN32(pixels+1);\
1056         uint32_t l0=  (a&0x03030303UL)\
1057                     + (b&0x03030303UL)\
1058                     + 0x02020202UL;\
1059         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1060                    + ((b&0xFCFCFCFCUL)>>2);\
1061         uint32_t l1,h1;\
1062 \
1063         pixels+=line_size;\
1064         for(i=0; i<h; i+=2){\
1065             uint32_t a= AV_RN32(pixels  );\
1066             uint32_t b= AV_RN32(pixels+1);\
1067             l1=  (a&0x03030303UL)\
1068                + (b&0x03030303UL);\
1069             h1= ((a&0xFCFCFCFCUL)>>2)\
1070               + ((b&0xFCFCFCFCUL)>>2);\
1071             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1072             pixels+=line_size;\
1073             block +=line_size;\
1074             a= AV_RN32(pixels  );\
1075             b= AV_RN32(pixels+1);\
1076             l0=  (a&0x03030303UL)\
1077                + (b&0x03030303UL)\
1078                + 0x02020202UL;\
1079             h0= ((a&0xFCFCFCFCUL)>>2)\
1080               + ((b&0xFCFCFCFCUL)>>2);\
1081             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1082             pixels+=line_size;\
1083             block +=line_size;\
1084         }\
1085 }\
1086 \
1087 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1088 {\
1089     int j;\
1090     for(j=0; j<2; j++){\
1091         int i;\
1092         const uint32_t a= AV_RN32(pixels  );\
1093         const uint32_t b= AV_RN32(pixels+1);\
1094         uint32_t l0=  (a&0x03030303UL)\
1095                     + (b&0x03030303UL)\
1096                     + 0x02020202UL;\
1097         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1098                    + ((b&0xFCFCFCFCUL)>>2);\
1099         uint32_t l1,h1;\
1100 \
1101         pixels+=line_size;\
1102         for(i=0; i<h; i+=2){\
1103             uint32_t a= AV_RN32(pixels  );\
1104             uint32_t b= AV_RN32(pixels+1);\
1105             l1=  (a&0x03030303UL)\
1106                + (b&0x03030303UL);\
1107             h1= ((a&0xFCFCFCFCUL)>>2)\
1108               + ((b&0xFCFCFCFCUL)>>2);\
1109             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1110             pixels+=line_size;\
1111             block +=line_size;\
1112             a= AV_RN32(pixels  );\
1113             b= AV_RN32(pixels+1);\
1114             l0=  (a&0x03030303UL)\
1115                + (b&0x03030303UL)\
1116                + 0x02020202UL;\
1117             h0= ((a&0xFCFCFCFCUL)>>2)\
1118               + ((b&0xFCFCFCFCUL)>>2);\
1119             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1120             pixels+=line_size;\
1121             block +=line_size;\
1122         }\
1123         pixels+=4-line_size*(h+1);\
1124         block +=4-line_size*h;\
1125     }\
1126 }\
1127 \
1128 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1129 {\
1130     int j;\
1131     for(j=0; j<2; j++){\
1132         int i;\
1133         const uint32_t a= AV_RN32(pixels  );\
1134         const uint32_t b= AV_RN32(pixels+1);\
1135         uint32_t l0=  (a&0x03030303UL)\
1136                     + (b&0x03030303UL)\
1137                     + 0x01010101UL;\
1138         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1139                    + ((b&0xFCFCFCFCUL)>>2);\
1140         uint32_t l1,h1;\
1141 \
1142         pixels+=line_size;\
1143         for(i=0; i<h; i+=2){\
1144             uint32_t a= AV_RN32(pixels  );\
1145             uint32_t b= AV_RN32(pixels+1);\
1146             l1=  (a&0x03030303UL)\
1147                + (b&0x03030303UL);\
1148             h1= ((a&0xFCFCFCFCUL)>>2)\
1149               + ((b&0xFCFCFCFCUL)>>2);\
1150             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151             pixels+=line_size;\
1152             block +=line_size;\
1153             a= AV_RN32(pixels  );\
1154             b= AV_RN32(pixels+1);\
1155             l0=  (a&0x03030303UL)\
1156                + (b&0x03030303UL)\
1157                + 0x01010101UL;\
1158             h0= ((a&0xFCFCFCFCUL)>>2)\
1159               + ((b&0xFCFCFCFCUL)>>2);\
1160             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161             pixels+=line_size;\
1162             block +=line_size;\
1163         }\
1164         pixels+=4-line_size*(h+1);\
1165         block +=4-line_size*h;\
1166     }\
1167 }\
1168 \
1169 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1170 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1171 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1172 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1173 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1174 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1175 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1176 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1177
1178 #define op_avg(a, b) a = rnd_avg32(a, b)
1179 #endif
1180 #define op_put(a, b) a = b
1181
1182 PIXOP2(avg, op_avg)
1183 PIXOP2(put, op_put)
1184 #undef op_avg
1185 #undef op_put
1186
/* The "no rounding" full-pel put names are plain aliases of the regular put
 * functions (presumably because a straight copy involves no averaging and
 * therefore no rounding choice -- confirm against put_pixels8_c). */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c
1189
/* Rounded-up average of two (avg2) or four (avg4) integer values.
 * Every argument is parenthesized so that expressions containing operators
 * with lower precedence than '+' (e.g. '|', '<<', '?:') are averaged as a
 * whole instead of being torn apart by the surrounding additions. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1192
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the same stride is
 * used for the destination and for both source pointers.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1196
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the same stride is
 * used for the destination and for both source pointers.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1200
/**
 * Bilinear interpolation of an 8 pixel wide block with 1/16 sample weights.
 *
 * Each output pixel is a weighted sum of the 2x2 source neighbourhood; the
 * four weights always add up to 256, hence the final shift by 8.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source, 9 bytes read from each of h+1 rows
 * @param stride  distance in bytes between rows of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour (in 1/16)
 * @param y16     vertical weight of the lower neighbour (in 1/16)
 * @param rounder value added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = (     x16) * (16 - y16);
    const int w_bl = (16 - x16) * (     y16);
    const int w_br = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1223
/**
 * Produce an 8 pixel wide, h rows high block by sampling src along an
 * affine trajectory with bilinear interpolation.
 *
 * The sampling position starts at (ox, oy), advances by (dxx, dyx) for every
 * output column and by (dxy, dyy) for every output row. After dropping the
 * low 16 bits, the lowest `shift` bits of a position are the interpolation
 * fraction and the remaining bits the integer source coordinate.
 *
 * width and height are decremented on entry; a coordinate is treated as
 * "inside" when it is >= 0 and strictly below the decremented value, so that
 * the +1 neighbour is still available. Outside coordinates are clipped to
 * [0, width-1] / [0, height-1] and interpolation is skipped along that axis.
 *
 * @param r  rounding constant added before the final shift by 2*shift
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int row;

    width--;
    height--;

    for (row = 0; row < h; row++) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < width;
            y_inside = (unsigned)src_y < height;

            if (x_inside && y_inside) {
                /* full 2x2 bilinear interpolation */
                index = src_x + src_y * stride;
                dst[row * stride + col] =
                    ((src[index]              * (s - frac_x) +
                      src[index + 1]          * frac_x) * (s - frac_y) +
                     (src[index + stride]     * (s - frac_x) +
                      src[index + stride + 1] * frac_x) * frac_y +
                     r) >> (shift * 2);
            } else if (x_inside) {
                /* row clipped: interpolate horizontally only */
                index = src_x + av_clip(src_y, 0, height) * stride;
                dst[row * stride + col] =
                    ((src[index]     * (s - frac_x) +
                      src[index + 1] * frac_x) * s +
                     r) >> (shift * 2);
            } else if (y_inside) {
                /* column clipped: interpolate vertically only */
                index = av_clip(src_x, 0, width) + src_y * stride;
                dst[row * stride + col] =
                    ((src[index]          * (s - frac_y) +
                      src[index + stride] * frac_y) * s +
                     r) >> (shift * 2);
            } else {
                /* both clipped: copy the nearest valid sample */
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                dst[row * stride + col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1281
/**
 * Integer-position case: dispatch to the put_pixelsN_c copy routine matching
 * the block width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1290
/**
 * Store a 2:1 blend of each pixel and its right-hand neighbour.
 * Multiplying by 683 and shifting right by 11 approximates the division
 * by 3; the +1 is the rounding bias.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1301
/**
 * Store a 1:2 blend of each pixel and its right-hand neighbour.
 * Multiplying by 683 and shifting right by 11 approximates the division
 * by 3; the +1 is the rounding bias.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1312
/**
 * Store a 2:1 blend of each pixel and the pixel directly below it.
 * Multiplying by 683 and shifting right by 11 approximates the division
 * by 3; the +1 is the rounding bias.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1323
/**
 * Store a blend of the 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12, and the +6 is the rounding bias.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1334
/**
 * Store a blend of the 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12, and the +6 is the rounding bias.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1345
/**
 * Store a 1:2 blend of each pixel and the pixel directly below it.
 * Multiplying by 683 and shifting right by 11 approximates the division
 * by 3; the +1 is the rounding bias.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1356
/**
 * Store a blend of the 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12, and the +6 is the rounding bias.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1367
/**
 * Store a blend of the 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12, and the +6 is the rounding bias.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1378
/**
 * Integer-position case: dispatch to the avg_pixelsN_c routine matching the
 * block width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1387
/**
 * Compute a 2:1 blend of each pixel and its right-hand neighbour
 * (multiply by 683, shift by 11 ~ divide by 3) and average it, rounding up,
 * with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
1398
/**
 * Compute a 1:2 blend of each pixel and its right-hand neighbour
 * (multiply by 683, shift by 11 ~ divide by 3) and average it, rounding up,
 * with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
1409
/**
 * Compute a 2:1 blend of each pixel and the pixel directly below it
 * (multiply by 683, shift by 11 ~ divide by 3) and average it, rounding up,
 * with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
1420
/**
 * Compute a blend of the 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * (multiply by 2731, shift by 15 ~ divide by 12) and average it, rounding
 * up, with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
1431
/**
 * Compute a blend of the 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * (multiply by 2731, shift by 15 ~ divide by 12) and average it, rounding
 * up, with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
1442
/**
 * Compute a 1:2 blend of each pixel and the pixel directly below it
 * (multiply by 683, shift by 11 ~ divide by 3) and average it, rounding up,
 * with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
1453
/**
 * Compute a blend of the 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * (multiply by 2731, shift by 15 ~ divide by 12) and average it, rounding
 * up, with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
1464
/**
 * Compute a blend of the 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * (multiply by 2731, shift by 15 ~ divide by 12) and average it, rounding
 * up, with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width put_tpel_pixels<width>_mcXY_c wrappers.
 * Each wrapper forwards to the generic put_tpel_pixels_mcXY_c() with the
 * width baked in. The forwarding statements are plain calls (a stray leading
 * "void" would turn them into invalid declarations), so the macro compiles
 * if this section is ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1496
/**
 * Generate one OPNAME##h264_chroma_mc<W>_c() function: bilinear
 * interpolation of a W pixel wide, h rows high block with 1/8 sample
 * weights.
 *
 * The four weights A, B, C, D are built from x and y (both asserted to be in
 * 0..7) and always sum to 64; OP receives the destination lvalue and the
 * unnormalized weighted sum and is responsible for rounding and storing.
 *
 * When D is zero (x == 0 or y == 0) at most one of B and C is non-zero, so
 * the filter collapses to two taps: weight E = B+C applied to the neighbour
 * one row below (C != 0) or one pixel to the right (otherwise).
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* general case: full 2x2 neighbourhood */\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++)\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* purely horizontal, purely vertical, or no interpolation */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++)\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* Instantiate the 2, 4 and 8 pixel wide variants for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* op_put: normalize the weighted sum (round, divide by 64) and store it.
 * op_avg: normalize likewise, then average with the existing destination
 *         value, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1605
1606 #define QPEL_MC(r, OPNAME, RND, OP) \
1607 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1608     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1609     int i;\
1610     for(i=0; i<h; i++)\
1611     {\
1612         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1613         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1614         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1615         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1616         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1617         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1618         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1619         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1620         dst+=dstStride;\
1621         src+=srcStride;\
1622     }\
1623 }\
1624 \
1625 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 8-tap lowpass: 9 input rows -> 8 output rows, applied to 8 columns */\
1626     const int w=8; /* number of columns processed */\
1627     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; the op_* macros passed in as OP index it */\
1628     int i;\
1629     for(i=0; i<w; i++)\
1630     {\
1631         const int src0= src[0*srcStride]; /* src0..src8: the 9 vertically adjacent samples of this column */\
1632         const int src1= src[1*srcStride];\
1633         const int src2= src[2*srcStride];\
1634         const int src3= src[3*srcStride];\
1635         const int src4= src[4*srcStride];\
1636         const int src5= src[5*srcStride];\
1637         const int src6= src[6*srcStride];\
1638         const int src7= src[7*srcStride];\
1639         const int src8= src[8*srcStride];\
1640         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4)); /* weights 20,-6,3,-1 on sample pairs; outer taps that would precede src0 reuse src0..src2 */\
1641         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1642         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1643         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7)); /* rows 3 and 4 use the plain symmetric pattern */\
1644         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1645         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8)); /* from here taps that would follow src8 reuse src8..src6 */\
1646         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1647         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1648         dst++; /* advance to the next column */\
1649         src++;\
1650     }\
1651 }\
1652 \
1653 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* horizontal 8-tap lowpass: src[0..16] -> dst[0..15] per row, for h rows */\
1654     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; the op_* macros passed in as OP index it */\
1655     int i;\
1656     \
1657     for(i=0; i<h; i++)\
1658     {\
1659         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4])); /* weights 20,-6,3,-1 on sample pairs; taps left of src[0] reuse src[0..2] */\
1660         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1661         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1662         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7])); /* dst[3]..dst[12] use the plain symmetric pattern */\
1663         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1664         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1665         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1666         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1667         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1668         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1669         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1670         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1671         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1672         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16])); /* from here taps right of src[16] reuse src[16..14] */\
1673         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1674         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1675         dst+=dstStride; /* advance both pointers to the next row */\
1676         src+=srcStride;\
1677     }\
1678 }\
1679 \
1680 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 8-tap lowpass: 17 input rows -> 16 output rows, applied to 16 columns */\
1681     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; the op_* macros passed in as OP index it */\
1682     int i;\
1683     const int w=16; /* number of columns processed */\
1684     for(i=0; i<w; i++)\
1685     {\
1686         const int src0= src[0*srcStride]; /* src0..src16: the 17 vertically adjacent samples of this column */\
1687         const int src1= src[1*srcStride];\
1688         const int src2= src[2*srcStride];\
1689         const int src3= src[3*srcStride];\
1690         const int src4= src[4*srcStride];\
1691         const int src5= src[5*srcStride];\
1692         const int src6= src[6*srcStride];\
1693         const int src7= src[7*srcStride];\
1694         const int src8= src[8*srcStride];\
1695         const int src9= src[9*srcStride];\
1696         const int src10= src[10*srcStride];\
1697         const int src11= src[11*srcStride];\
1698         const int src12= src[12*srcStride];\
1699         const int src13= src[13*srcStride];\
1700         const int src14= src[14*srcStride];\
1701         const int src15= src[15*srcStride];\
1702         const int src16= src[16*srcStride];\
1703         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 )); /* weights 20,-6,3,-1 on sample pairs; taps that would precede src0 reuse src0..src2 */\
1704         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1705         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1706         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 )); /* rows 3..12 use the plain symmetric pattern */\
1707         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1708         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1709         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1710         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1711         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1712         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1713         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1714         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1715         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1716         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16)); /* from here taps that would follow src16 reuse src16..src14 */\
1717         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1718         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1719         dst++; /* advance to the next column */\
1720         src++;\
1721     }\
1722 }\
1723 \
1724 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: hands src and its horizontally lowpassed copy to pixels8_l2 */\
1725     uint8_t half[64]; /* 8 rows of 8 filtered samples, stride 8 */\
1726     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); /* the intermediate always uses the put variant, whatever OPNAME is */\
1727     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1728 }\
1729 \
1730 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontal lowpass of src applied straight to dst */\
1731     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); /* 8 rows; src and dst share the same stride */\
1732 }\
1733 \
1734 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: like qpel8_mc10_c but pairs the filtered block with src shifted one sample right */\
1735     uint8_t half[64]; /* 8 rows of 8 filtered samples, stride 8 */\
1736     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); /* the intermediate always uses the put variant */\
1737     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1738 }\
1739 \
1740 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: hands the source block and its vertically lowpassed copy to pixels8_l2 */\
1741     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1742     uint8_t half[64]; /* 8x8 vertically filtered block, stride 8 */\
1743     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere; called with dst stride 16, src stride `stride`, 9 rows */\
1744     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); /* the intermediate always uses the put variant */\
1745     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1746 }\
1747 \
1748 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: vertical lowpass of the source applied straight to dst */\
1749     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1750     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1751     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); /* reads the 9 copied rows, writes 8 rows to dst */\
1752 }\
1753 \
1754 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: like qpel8_mc01_c but pairs the filtered block with the source shifted one row down */\
1755     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1756     uint8_t half[64]; /* 8x8 vertically filtered block, stride 8 */\
1757     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1758     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); /* the intermediate always uses the put variant */\
1759     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8); /* full+16 skips one 16-byte row; pixels8_l2 presumably averages its two inputs - confirm */\
1760 }\
1761 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: passes source, H-, V- and HV-filtered blocks to pixels8_l4 */\
1762     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1763     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1764     uint8_t halfV[64]; /* 8x8 vertically filtered source */\
1765     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1766     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1767     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); /* 9 rows so the vertical pass below has its extra row */\
1768     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1769     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1770     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* pixels8_l4 is defined elsewhere; presumably combines the four inputs into dst - confirm */\
1771 }\
1772 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: H-filter, merge with source, V-filter the result, then merge both into dst */\
1773     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1774     uint8_t halfH[72]; /* 9 rows of 8 samples, stride 8 */\
1775     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1776     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1777     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1778     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* in place: halfH is both destination and first input, over all 9 rows */\
1779     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1781 }\
1782 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: as qpel8_mc11_old_c but using the source shifted one sample right */\
1783     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1784     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1785     uint8_t halfV[64]; /* 8x8 vertical filter of full+1 */\
1786     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1787     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1788     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* full+1: one sample to the right */\
1790     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* pixels8_l4 is defined elsewhere; presumably combines the four inputs into dst - confirm */\
1792 }\
1793 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as qpel8_mc11_c but merging halfH with the source shifted one sample right */\
1794     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1795     uint8_t halfH[72]; /* 9 rows of 8 samples, stride 8 */\
1796     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1797     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1798     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1799     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9); /* in place over all 9 rows; second input is full+1 */\
1800     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1801     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1802 }\
1803 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: as qpel8_mc11_old_c but using source and halfH shifted one row down */\
1804     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1805     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1806     uint8_t halfV[64]; /* 8x8 vertically filtered source */\
1807     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1808     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1809     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1810     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1811     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* full+16 and halfH+8 each skip one row; pixels8_l4 is defined elsewhere - confirm semantics */\
1813 }\
1814 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as qpel8_mc11_c but the final merge uses halfH shifted one row down */\
1815     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1816     uint8_t halfH[72]; /* 9 rows of 8 samples, stride 8 */\
1817     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1818     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1819     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1820     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* in place over all 9 rows */\
1821     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8 skips one 8-byte row; pixels8_l2 presumably averages its two inputs - confirm */\
1823 }\
1824 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: uses the source shifted one row down and one sample right */\
1825     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1826     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1827     uint8_t halfV[64]; /* 8x8 vertical filter of full+1 */\
1828     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1829     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1830     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1831     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* full+1: one sample to the right */\
1832     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1833     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* full+17 = one row (16) plus one sample; halfH+8 = one row; pixels8_l4 is defined elsewhere */\
1834 }\
1835 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as qpel8_mc31_c but the final merge uses halfH shifted one row down */\
1836     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1837     uint8_t halfH[72]; /* 9 rows of 8 samples, stride 8 */\
1838     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1839     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1840     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9); /* in place over all 9 rows; second input is full+1 */\
1842     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1843     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8 skips one 8-byte row; pixels8_l2 presumably averages its two inputs - confirm */\
1844 }\
1845 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: hands the H-filtered block and its V-filtered version to pixels8_l2 */\
1846     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1847     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1848     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* filters src directly, no local copy; 9 rows for the vertical pass */\
1849     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1850     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1851 }\
1852 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as qpel8_mc21_c but the merge uses halfH shifted one row down */\
1853     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1854     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1855     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* filters src directly, no local copy */\
1856     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8 skips one 8-byte row; pixels8_l2 presumably averages its two inputs - confirm */\
1858 }\
1859 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: hands the V-filtered source and the HV-filtered block to pixels8_l2 */\
1860     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1861     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples; only feeds halfHV */\
1862     uint8_t halfV[64]; /* 8x8 vertically filtered source */\
1863     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1864     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1865     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1866     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1867     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1868     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1869 }\
1870 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: H-filter, merge with source, then vertical lowpass straight to dst */\
1871     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1872     uint8_t halfH[72]; /* 9 rows of 8 samples, stride 8 */\
1873     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1874     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1875     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* in place over all 9 rows; pixels8_l2 presumably averages its two inputs - confirm */\
1876     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); /* only this last step uses the OPNAME variant */\
1877 }\
1878 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 8x8 variant: as qpel8_mc12_old_c but V-filtering the source shifted one sample right */\
1879     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1880     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples; only feeds halfHV */\
1881     uint8_t halfV[64]; /* 8x8 vertical filter of full+1 */\
1882     uint8_t halfHV[64]; /* 8x8 vertical filter of halfH */\
1883     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1884     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1885     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* full+1: one sample to the right */\
1886     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1887     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1888 }\
1889 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: as qpel8_mc12_c but merging halfH with the source shifted one sample right */\
1890     uint8_t full[16*9]; /* local copy of the source: 9 rows with a row stride of 16 */\
1891     uint8_t halfH[72]; /* 9 rows of 8 samples, stride 8 */\
1892     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere */\
1893     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1894     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9); /* in place over all 9 rows; pixels8_l2 presumably averages its two inputs - confirm */\
1895     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); /* only this last step uses the OPNAME variant */\
1896 }\
1897 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* 8x8: horizontal lowpass followed by vertical lowpass straight to dst */\
1898     uint8_t halfH[72]; /* 9 rows of 8 horizontally filtered samples */\
1899     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* 9 rows because the vertical filter reads 9 rows */\
1900     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1901 }\
1902 \
1903 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: hands src and its horizontally lowpassed copy to pixels16_l2 */\
1904     uint8_t half[256]; /* 16 rows of 16 filtered samples, stride 16 */\
1905     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); /* the intermediate always uses the put variant, whatever OPNAME is */\
1906     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1907 }\
1908 \
1909 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontal lowpass of src applied straight to dst */\
1910     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); /* 16 rows; src and dst share the same stride */\
1911 }\
1912 \
1913 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: like qpel16_mc10_c but pairs the filtered block with src shifted one sample right */\
1914     uint8_t half[256]; /* 16 rows of 16 filtered samples, stride 16 */\
1915     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); /* the intermediate always uses the put variant */\
1916     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1917 }\
1918 \
1919 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: hands the source block and its vertically lowpassed copy to pixels16_l2 */\
1920     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1921     uint8_t half[256]; /* 16x16 vertically filtered block, stride 16 */\
1922     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere; called with dst stride 24, src stride `stride`, 17 rows */\
1923     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); /* the intermediate always uses the put variant */\
1924     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1925 }\
1926 \
1927 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: vertical lowpass of the source applied straight to dst */\
1928     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1929     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1930     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); /* reads the 17 copied rows, writes 16 rows to dst */\
1931 }\
1932 \
1933 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: like qpel16_mc01_c but pairs the filtered block with the source shifted one row down */\
1934     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1935     uint8_t half[256]; /* 16x16 vertically filtered block, stride 16 */\
1936     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1937     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); /* the intermediate always uses the put variant */\
1938     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16); /* full+24 skips one 24-byte row; pixels16_l2 presumably averages its two inputs - confirm */\
1939 }\
1940 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: passes source, H-, V- and HV-filtered blocks to pixels16_l4 */\
1941     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1942     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
1943     uint8_t halfV[256]; /* 16x16 vertically filtered source */\
1944     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
1945     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1946     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); /* 17 rows so the vertical pass below has its extra row */\
1947     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1948     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* pixels16_l4 is defined elsewhere; presumably combines the four inputs into dst - confirm */\
1950 }\
1951 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: H-filter, merge with source, V-filter the result, then merge both into dst */\
1952     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1953     uint8_t halfH[272]; /* 17 rows of 16 samples, stride 16 */\
1954     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
1955     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1956     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1957     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place: halfH is both destination and first input, over all 17 rows */\
1958     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1960 }\
1961 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: as qpel16_mc11_old_c but using the source shifted one sample right */\
1962     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1963     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
1964     uint8_t halfV[256]; /* 16x16 vertical filter of full+1 */\
1965     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
1966     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1967     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* full+1: one sample to the right */\
1969     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* pixels16_l4 is defined elsewhere; presumably combines the four inputs into dst - confirm */\
1971 }\
1972 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as qpel16_mc11_c but merging halfH with the source shifted one sample right */\
1973     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1974     uint8_t halfH[272]; /* 17 rows of 16 samples, stride 16 */\
1975     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
1976     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1977     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1978     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place over all 17 rows; second input is full+1 */\
1979     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1980     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
1981 }\
1982 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: as qpel16_mc11_old_c but using source and halfH shifted one row down */\
1983     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1984     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
1985     uint8_t halfV[256]; /* 16x16 vertically filtered source */\
1986     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
1987     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1988     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1990     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* full+24 and halfH+16 each skip one row; pixels16_l4 is defined elsewhere - confirm semantics */\
1992 }\
1993 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as qpel16_mc11_c but the final merge uses halfH shifted one row down */\
1994     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
1995     uint8_t halfH[272]; /* 17 rows of 16 samples, stride 16 */\
1996     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
1997     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
1998     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place over all 17 rows */\
2000     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips one 16-byte row; pixels16_l2 presumably averages its two inputs - confirm */\
2002 }\
2003 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: uses the source shifted one row down and one sample right */\
2004     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
2005     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
2006     uint8_t halfV[256]; /* 16x16 vertical filter of full+1 */\
2007     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
2008     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
2009     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2010     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* full+1: one sample to the right */\
2011     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* full+25 = one row (24) plus one sample; halfH+16 = one row; pixels16_l4 is defined elsewhere */\
2013 }\
2014 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as qpel16_mc31_c but the final merge uses halfH shifted one row down */\
2015     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
2016     uint8_t halfH[272]; /* 17 rows of 16 samples, stride 16 */\
2017     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
2018     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
2019     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2020     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place over all 17 rows; second input is full+1 */\
2021     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2022     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips one 16-byte row; pixels16_l2 presumably averages its two inputs - confirm */\
2023 }\
2024 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: hands the H-filtered block and its V-filtered version to pixels16_l2 */\
2025     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
2026     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
2027     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* filters src directly, no local copy; 17 rows for the vertical pass */\
2028     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2029     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
2030 }\
2031 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as qpel16_mc21_c but the merge uses halfH shifted one row down */\
2032     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
2033     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
2034     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* filters src directly, no local copy */\
2035     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2036     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips one 16-byte row; pixels16_l2 presumably averages its two inputs - confirm */\
2037 }\
2038 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: hands the V-filtered source and the HV-filtered block to pixels16_l2 */\
2039     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
2040     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples; only feeds halfHV */\
2041     uint8_t halfV[256]; /* 16x16 vertically filtered source */\
2042     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
2043     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
2044     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2045     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2046     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2047     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
2048 }\
2049 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: H-filter, merge with source, then vertical lowpass straight to dst */\
2050     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
2051     uint8_t halfH[272]; /* 17 rows of 16 samples, stride 16 */\
2052     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
2053     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2054     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place over all 17 rows; pixels16_l2 presumably averages its two inputs - confirm */\
2055     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* only this last step uses the OPNAME variant */\
2056 }\
2057 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static 16x16 variant: as qpel16_mc12_old_c but V-filtering the source shifted one sample right */\
2058     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
2059     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples; only feeds halfHV */\
2060     uint8_t halfV[256]; /* 16x16 vertical filter of full+1 */\
2061     uint8_t halfHV[256]; /* 16x16 vertical filter of halfH */\
2062     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
2063     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2064     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* full+1: one sample to the right */\
2065     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2066     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two inputs into dst - confirm */\
2067 }\
2068 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: as qpel16_mc12_c but merging halfH with the source shifted one sample right */\
2069     uint8_t full[24*17]; /* local copy of the source: 17 rows with a row stride of 24 */\
2070     uint8_t halfH[272]; /* 17 rows of 16 samples, stride 16 */\
2071     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere */\
2072     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2073     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place over all 17 rows; pixels16_l2 presumably averages its two inputs - confirm */\
2074     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* only this last step uses the OPNAME variant */\
2075 }\
2076 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* 16x16: horizontal lowpass followed by vertical lowpass straight to dst */\
2077     uint8_t halfH[272]; /* 17 rows of 16 horizontally filtered samples */\
2078     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* 17 rows because the vertical filter reads 17 rows */\
2079     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2080 } /* last line of the QPEL_MC macro body: no continuation backslash */
2081
/* Store operators passed to QPEL_MC as its OP argument.
 * `a` is the destination lvalue and `b` the raw filter sum. The sum is biased
 * by 16 (rounding variants) or 15 (no_rnd variants), shifted right by 5 and
 * looked up in `cm`, which every lowpass function declares as
 * ff_cropTbl + MAX_NEG_CROP (presumably a clamp-to-byte table - confirm).
 * The put variants store that value; the avg variants store the mean of it
 * and the value already in `a` (adding 1 before the shift in op_avg only). */
2082 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2083 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2084 #define op_put(a, b) a = cm[((b) + 16)>>5]
2085 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2086
/* Expand QPEL_MC once per function-name prefix (second argument = OPNAME).
 * The third argument is pasted into the names of the intermediate filters
 * (put ## RND ## ...), so the avg_ family builds its intermediates with the
 * put_ filters. The role of the first argument is not visible in this part
 * of the file. */
2087 QPEL_MC(0, put_       , _       , op_put)
2088 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2089 QPEL_MC(0, avg_       , _       , op_avg)
/* The avg_no_rnd expansion below is disabled, so op_avg_no_rnd is not used by
 * any expansion in this group. */
2090 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only needed for the expansions above. */
2091 #undef op_avg
2092 #undef op_avg_no_rnd
2093 #undef op_put
2094 #undef op_put_no_rnd
2095
/* QPEL_MC generates no *_mc00_c functions in the part of it visible here;
 * these names are aliased to existing block functions instead (presumably
 * the plain copy/average of an unfiltered block - confirm against the
 * definitions of ff_put_pixels* and ff_avg_pixels*). Note that the
 * put_no_rnd aliases map to the same functions as the put aliases. */
2096 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2097 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2098 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2099 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2100 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2101 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2102
2103 #if 1
2104 #define H264_LOWPASS(OPNAME, OP, OP2) \
2105 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap lowpass producing 2 samples per row for 2 rows */\
2106     const int h=2; /* number of rows processed */\
2107     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP macro argument - confirm */\
2108     int i;\
2109     for(i=0; i<h; i++)\
2110     {\
2111         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* weights 20,-5,1 on symmetric pairs; each row reads src[-2]..src[4] */\
2112         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2113         dst+=dstStride; /* advance both pointers to the next row */\
2114         src+=srcStride;\
2115     }\
2116 }\
2117 \
2118 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap lowpass producing 2 rows for 2 columns */\
2119     const int w=2; /* number of columns processed */\
2120     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP macro argument - confirm */\
2121     int i;\
2122     for(i=0; i<w; i++)\
2123     {\
2124         const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above src; rows -2..4 are read */\
2125         const int srcA= src[-1*srcStride];\
2126         const int src0= src[0 *srcStride];\
2127         const int src1= src[1 *srcStride];\
2128         const int src2= src[2 *srcStride];\
2129         const int src3= src[3 *srcStride];\
2130         const int src4= src[4 *srcStride];\
2131         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3)); /* weights 20,-5,1 on symmetric pairs */\
2132         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2133         dst++; /* advance to the next column */\
2134         src++;\
2135     }\
2136 }\
2137 \
2138 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 2x2 two-pass filter: horizontal 6-tap into tmp, then vertical 6-tap of tmp into dst */\
2139     const int h=2;\
2140     const int w=2;\
2141     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP2 macro argument - confirm */\
2142     int i;\
2143     src -= 2*srcStride; /* start two rows above: the vertical pass needs rows -2..h+2 */\
2144     for(i=0; i<h+5; i++)\
2145     {\
2146         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]); /* unscaled horizontal sums are kept in the int16_t buffer */\
2147         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2148         tmp+=tmpStride;\
2149         src+=srcStride;\
2150     }\
2151     tmp -= tmpStride*(h+5-2); /* rewind to the tmp row that was computed from the caller's src row 0 */\
2152     for(i=0; i<w; i++)\
2153     {\
2154         const int tmpB= tmp[-2*tmpStride]; /* tmp rows -2..4 are read, i.e. all h+5 rows written above */\
2155         const int tmpA= tmp[-1*tmpStride];\
2156         const int tmp0= tmp[0 *tmpStride];\
2157         const int tmp1= tmp[1 *tmpStride];\
2158         const int tmp2= tmp[2 *tmpStride];\
2159         const int tmp3= tmp[3 *tmpStride];\
2160         const int tmp4= tmp[4 *tmpStride];\
2161         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* OP2, not OP, is applied to the two-pass sum */\
2162         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2163         dst++; /* advance to the next column */\
2164         tmp++;\
2165     }\
2166 }\
2167 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap lowpass producing 4 samples per row for 4 rows */\
2168     const int h=4; /* number of rows processed */\
2169     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP macro argument - confirm */\
2170     int i;\
2171     for(i=0; i<h; i++)\
2172     {\
2173         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* weights 20,-5,1 on symmetric pairs; each row reads src[-2]..src[6] */\
2174         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2175         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2176         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2177         dst+=dstStride; /* advance both pointers to the next row */\
2178         src+=srcStride;\
2179     }\
2180 }\
2181 \
2182 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap lowpass producing 4 rows for 4 columns */\
2183     const int w=4; /* number of columns processed */\
2184     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP macro argument - confirm */\
2185     int i;\
2186     for(i=0; i<w; i++)\
2187     {\
2188         const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above src; rows -2..6 are read */\
2189         const int srcA= src[-1*srcStride];\
2190         const int src0= src[0 *srcStride];\
2191         const int src1= src[1 *srcStride];\
2192         const int src2= src[2 *srcStride];\
2193         const int src3= src[3 *srcStride];\
2194         const int src4= src[4 *srcStride];\
2195         const int src5= src[5 *srcStride];\
2196         const int src6= src[6 *srcStride];\
2197         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3)); /* weights 20,-5,1 on symmetric pairs */\
2198         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2199         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2200         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2201         dst++; /* advance to the next column */\
2202         src++;\
2203     }\
2204 }\
2205 \
2206 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 4x4 two-pass filter: horizontal 6-tap into tmp, then vertical 6-tap of tmp into dst */\
2207     const int h=4;\
2208     const int w=4;\
2209     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP2 macro argument - confirm */\
2210     int i;\
2211     src -= 2*srcStride; /* start two rows above: the vertical pass needs rows -2..h+2 */\
2212     for(i=0; i<h+5; i++)\
2213     {\
2214         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]); /* unscaled horizontal sums are kept in the int16_t buffer */\
2215         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2216         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2217         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2218         tmp+=tmpStride;\
2219         src+=srcStride;\
2220     }\
2221     tmp -= tmpStride*(h+5-2); /* rewind to the tmp row that was computed from the caller's src row 0 */\
2222     for(i=0; i<w; i++)\
2223     {\
2224         const int tmpB= tmp[-2*tmpStride]; /* tmp rows -2..6 are read, i.e. all h+5 rows written above */\
2225         const int tmpA= tmp[-1*tmpStride];\
2226         const int tmp0= tmp[0 *tmpStride];\
2227         const int tmp1= tmp[1 *tmpStride];\
2228         const int tmp2= tmp[2 *tmpStride];\
2229         const int tmp3= tmp[3 *tmpStride];\
2230         const int tmp4= tmp[4 *tmpStride];\
2231         const int tmp5= tmp[5 *tmpStride];\
2232         const int tmp6= tmp[6 *tmpStride];\
2233         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3)); /* OP2, not OP, is applied to the two-pass sum */\
2234         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2235         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2236         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2237         dst++; /* advance to the next column */\
2238         tmp++;\
2239     }\
2240 }\
2241 \
2242 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap lowpass producing 8 samples per row for 8 rows */\
2243     const int h=8; /* number of rows processed */\
2244     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP macro argument - confirm */\
2245     int i;\
2246     for(i=0; i<h; i++)\
2247     {\
2248         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ])); /* weights 20,-5,1 on symmetric pairs; each row reads src[-2]..src[10] */\
2249         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2250         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2251         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2252         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2253         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2254         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2255         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2256         dst+=dstStride; /* advance both pointers to the next row */\
2257         src+=srcStride;\
2258     }\
2259 }\
2260 \
2261 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap lowpass producing 8 rows for 8 columns */\
2262     const int w=8; /* number of columns processed */\
2263     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not named below; NOTE(review): presumably used inside the OP macro argument - confirm */\
2264     int i;\
2265     for(i=0; i<w; i++)\
2266     {\
2267         const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above src; rows -2..10 are read */\
2268         const int srcA= src[-1*srcStride];\
2269         const int src0= src[0 *srcStride];\
2270         const int src1= src[1 *srcStride];\
2271         const int src2= src[2 *srcStride];\
2272         const int src3= src[3 *srcStride];\
2273         const int src4= src[4 *srcStride];\
2274         const int src5= src[5 *srcStride];\
2275         const int src6= src[6 *srcStride];\
2276         const int src7= src[7 *srcStride];\
2277         const int src8= src[8 *srcStride];\
2278         const int src9= src[9 *srcStride];\
2279         const int src10=src[10*srcStride];\
2280         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3)); /* weights 20,-5,1 on symmetric pairs */\
2281         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2282         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2283         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2284         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2285         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2286         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2287         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2288         dst++; /* advance to the next column */\
2289         src++;\
2290     }\
2291 }\
2292 \
2293 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2294     const int h=8;\
2295     const int w=8;\
2296     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2297     int i;\
2298     src -= 2*srcStride;\
2299     for(i=0; i<h+5; i++)\
2300     {\
2301         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2302         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2303         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2304         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2305         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2306         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2307         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2308         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2309         tmp+=tmpStride;\
2310         src+=srcStride;\
2311     }\
2312     tmp -= tmpStride*(h+5-2);\
2313     for(i=0; i<w; i++)\
2314     {\
2315         const int tmpB= tmp[-2*tmpStride];\
2316         const int tmpA= tmp[-1*tmpStride];\
2317         const int tmp0= tmp[0 *tmpStride];\
2318         const int tmp1= tmp[1 *tmpStride];\
2319         const int tmp2= tmp[2 *tmpStride];\
2320         const int tmp3= tmp[3 *tmpStride];\
2321         const int tmp4= tmp[4 *tmpStride];\
2322         const int tmp5= tmp[5 *tmpStride];\
2323         const int tmp6= tmp[6 *tmpStride];\
2324         const int tmp7= tmp[7 *tmpStride];\
2325         const int tmp8= tmp[8 *tmpStride];\
2326         const int tmp9= tmp[9 *tmpStride];\
2327         const int tmp10=tmp[10*tmpStride];\
2328         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2329         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2330         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2331         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2332         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2333         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2334         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2335         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2336         dst++;\
2337         tmp++;\
2338     }\
2339 }\
2340 \
2341 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2342     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2343     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2344     src += 8*srcStride;\
2345     dst += 8*dstStride;\
2346     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2347     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2348 }\
2349 \
2350 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2351     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2352     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2353     src += 8*srcStride;\
2354     dst += 8*dstStride;\
2355     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2356     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2357 }\
2358 \
2359 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2360     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2361     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2362     src += 8*srcStride;\
2363     dst += 8*dstStride;\
2364     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2365     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2366 }\
2367
/*
 * H264_MC(OPNAME, SIZE)
 *
 * Emits the sixteen functions OPNAME##h264_qpel##SIZE##_mcXY_c (X, Y in 0..3).
 * Every variant is assembled from the same parts:
 *   - put_h264_qpel##SIZE##_{h,v,hv}_lowpass fill SIZE x SIZE scratch planes,
 *   - copy_block##SIZE gathers SIZE+5 source rows, starting two rows above
 *     src, into a buffer whose row pitch is SIZE (padded_mid is the row that
 *     corresponds to src),
 *   - OPNAME##pixels##SIZE##_l2 merges two planes into dst (presumably a
 *     rounding average of its two inputs; confirm at its definition).
 * mc00, mc20, mc02 and mc22 write to dst directly through the OPNAME variant
 * of the copy or lowpass routine.
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hbuf, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    /* same as mc10, but merged with the pixel one column to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hbuf, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    uint8_t vbuf[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, padded_mid, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, padded_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    uint8_t vbuf[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    /* same as mc01, but merged with the pixel one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, padded_mid+SIZE, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    /* vertical pass runs on the columns shifted one pixel to the right */\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    /* horizontal pass runs on the rows shifted one line down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t work[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, work, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, work, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, work, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, work, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const padded_mid = padded + SIZE*2;\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, work, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, padded_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}
2504
2505 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2506 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2507 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2508 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2509 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2510
2511 H264_LOWPASS(put_       , op_put, op2_put)
2512 H264_LOWPASS(avg_       , op_avg, op2_avg)
2513 H264_MC(put_, 2)
2514 H264_MC(put_, 4)
2515 H264_MC(put_, 8)
2516 H264_MC(put_, 16)
2517 H264_MC(avg_, 4)
2518 H264_MC(avg_, 8)
2519 H264_MC(avg_, 16)
2520
2521 #undef op_avg
2522 #undef op_put
2523 #undef op2_avg
2524 #undef op2_put
2525 #endif
2526
/* From this point on, the 8x8 and 16x16 mc00 names expand to the
 * ff_{put,avg}_pixels{8x8,16x16}_c functions. */
#define put_h264_qpel8_mc00_c   ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c   ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c  ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c  ff_avg_pixels16x16_c
2531
2532 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2533     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2534     int i;
2535
2536     for(i=0; i<h; i++){
2537         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2538         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2539         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2540         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2541         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2542         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2543         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2544         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2545         dst+=dstStride;
2546         src+=srcStride;
2547     }
2548 }
2549
/** Fixed-size entry point: forwards to put_pixels8_c() with a height of 8. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
/** Fixed-size entry point: forwards to avg_pixels8_c() with a height of 8. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_c(dst, src, stride, rows);
}
/** Fixed-size entry point: forwards to put_pixels16_c() with a height of 16. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_c(dst, src, stride, rows);
}
/** Fixed-size entry point: forwards to avg_pixels16_c() with a height of 16. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_c(dst, src, stride, rows);
}
2562
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points.  Each one forwards to the matching
 * {put,avg}_pixels{16,8}_xy2_c routine with the block height fixed to the
 * block width.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
2577
2578 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2579     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2580     int i;
2581
2582     for(i=0; i<w; i++){
2583         const int src_1= src[ -srcStride];
2584         const int src0 = src[0          ];
2585         const int src1 = src[  srcStride];
2586         const int src2 = src[2*srcStride];
2587         const int src3 = src[3*srcStride];
2588         const int src4 = src[4*srcStride];
2589         const int src5 = src[5*srcStride];
2590         const int src6 = src[6*srcStride];
2591         const int src7 = src[7*srcStride];
2592         const int src8 = src[8*srcStride];
2593         const int src9 = src[9*srcStride];
2594         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2595         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2596         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2597         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2598         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2599         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2600         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2601         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2602         src++;
2603         dst++;
2604     }
2605 }
2606
/* mc10: horizontally filtered 8x8 plane merged with src via put_pixels8_l2(). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2612
/* mc20: horizontal filter written directly into dst, 8 rows. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2616
/* mc30: like mc10, but the filtered plane is merged with src+1. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2622
/* mc02: vertical filter written directly into dst, 8 columns. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2626
/*
 * mc12: merges the vertically filtered source with the plane obtained by
 * filtering horizontally (11 rows, starting one row above src) and then
 * vertically.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t vfilt[8*8];
    uint8_t hrows[8*11];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* hrows+8 is the row that lines up with src */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * mc32: like mc12, but the purely vertical pass runs on the columns starting
 * at src+1.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t vfilt[8*8];
    uint8_t hrows[8*11];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* hrows+8 is the row that lines up with src */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * mc22: horizontal filter into an 11-row scratch plane (starting one row
 * above src), then vertical filter from that plane straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2650
2651 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2652     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2653     int x;
2654     const int strength= ff_h263_loop_filter_strength[qscale];
2655
2656     for(x=0; x<8; x++){
2657         int d1, d2, ad1;
2658         int p0= src[x-2*stride];
2659         int p1= src[x-1*stride];
2660         int p2= src[x+0*stride];
2661         int p3= src[x+1*stride];
2662         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2663
2664         if     (d<-2*strength) d1= 0;
2665         else if(d<-  strength) d1=-2*strength - d;
2666         else if(d<   strength) d1= d;
2667         else if(d< 2*strength) d1= 2*strength - d;
2668         else                   d1= 0;
2669
2670         p1 += d1;
2671         p2 -= d1;
2672         if(p1&256) p1= ~(p1>>31);
2673         if(p2&256) p2= ~(p2>>31);
2674
2675         src[x-1*stride] = p1;
2676         src[x+0*stride] = p2;
2677
2678         ad1= FFABS(d1)>>1;
2679
2680         d2= av_clip((p0-p3)/4, -ad1, ad1);
2681
2682         src[x-2*stride] = p0 - d2;
2683         src[x+  stride] = p3 + d2;
2684     }
2685     }
2686 }
2687
2688 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2689     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2690     int y;
2691     const int strength= ff_h263_loop_filter_strength[qscale];
2692
2693     for(y=0; y<8; y++){
2694         int d1, d2, ad1;
2695         int p0= src[y*stride-2];
2696         int p1= src[y*stride-1];
2697         int p2= src[y*stride+0];
2698         int p3= src[y*stride+1];
2699         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2700
2701         if     (d<-2*strength) d1= 0;
2702         else if(d<-  strength) d1=-2*strength - d;
2703         else if(d<   strength) d1= d;
2704         else if(d< 2*strength) d1= 2*strength - d;
2705         else                   d1= 0;
2706
2707         p1 += d1;
2708         p2 -= d1;
2709         if(p1&256) p1= ~(p1>>31);
2710         if(p2&256) p2= ~(p2>>31);
2711
2712         src[y*stride-1] = p1;
2713         src[y*stride+0] = p2;
2714
2715         ad1= FFABS(d1)>>1;
2716
2717         d2= av_clip((p0-p3)/4, -ad1, ad1);
2718
2719         src[y*stride-2] = p0 - d2;
2720         src[y*stride+1] = p3 + d2;
2721     }
2722     }
2723 }
2724
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 * Vertical pass: rows 1..6 get above + 2*centre + below, rows 0 and 7 are
 * scaled by 4 so every intermediate value carries the same gain.
 * Horizontal pass: columns 1..6 apply (1,2,1) again and divide by 16 with
 * rounding; columns 0 and 7 only undo the vertical gain (divide by 4 with
 * rounding).
 *
 * @param src    top-left pixel of the block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8*8];
    int row, col;

    /* vertical pass into the scratch plane */
    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;
        int *out= vert + row*8;

        if(row==0 || row==7){
            for(col=0; col<8; col++)
                out[col]= 4*line[col];
        }else{
            for(col=0; col<8; col++)
                out[col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass back into the block */
    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        const int *in= vert + row*8;

        line[0]= (in[0] + 2)>>2;
        line[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2751
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2779
/**
 * Sum of absolute differences, 16 pixels wide, between pix1 and the
 * avg2() of each pix2 pixel with its right-hand neighbour.
 * Reads 17 pixels per pix2 row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2807
/**
 * Sum of absolute differences, 16 pixels wide, between pix1 and the
 * avg2() of each pix2 pixel with the pixel directly below it.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2837
/**
 * Sum of absolute differences, 16 pixels wide, between pix1 and the
 * avg4() of each 2x2 neighbourhood of pix2 (pixel, right, below,
 * below-right).  Reads 17 pixels per row over h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2867
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2887
/**
 * Sum of absolute differences, 8 pixels wide, between pix1 and the
 * avg2() of each pix2 pixel with its right-hand neighbour.
 * Reads 9 pixels per pix2 row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2907
/**
 * Sum of absolute differences, 8 pixels wide, between pix1 and the
 * avg2() of each pix2 pixel with the pixel directly below it.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2929
/**
 * Sum of absolute differences, 8 pixels wide, between pix1 and the
 * avg4() of each 2x2 neighbourhood of pix2 (pixel, right, below,
 * below-right).  Reads 9 pixels per row over h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2951
2952 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2953     MpegEncContext *c = v;
2954     int score1=0;
2955     int score2=0;
2956     int x,y;
2957
2958     for(y=0; y<h; y++){
2959         for(x=0; x<16; x++){
2960             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2961         }
2962         if(y+1<h){
2963             for(x=0; x<15; x++){
2964                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2965                              - s1[x+1] + s1[x+1+stride])
2966                         -FFABS(  s2[x  ] - s2[x  +stride]
2967                              - s2[x+1] + s2[x+1+stride]);
2968             }
2969         }
2970         s1+= stride;
2971         s2+= stride;
2972     }
2973
2974     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2975     else  return score1 + FFABS(score2)*8;
2976 }
2977
2978 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2979     MpegEncContext *c = v;
2980     int score1=0;
2981     int score2=0;
2982     int x,y;
2983
2984     for(y=0; y<h; y++){
2985         for(x=0; x<8; x++){
2986             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2987         }
2988         if(y+1<h){
2989             for(x=0; x<7; x++){
2990                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2991                              - s1[x+1] + s1[x+1+stride])
2992                         -FFABS(  s2[x  ] - s2[x  +stride]
2993                              - s2[x+1] + s2[x+1+stride]);
2994             }
2995         }
2996         s1+= stride;
2997         s2+= stride;
2998     }
2999
3000     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3001     else  return score1 + FFABS(score2)*8;
3002 }
3003
3004 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3005     int i;
3006     unsigned int sum=0;
3007
3008     for(i=0; i<8*8; i++){
3009         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3010         int w= weight[i];
3011         b>>= RECON_SHIFT;
3012         assert(-512<b && b<512);
3013
3014         sum += (w*b)*(w*b)>>4;
3015     }
3016     return sum>>2;
3017 }
3018
3019 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3020     int i;
3021
3022     for(i=0; i<8*8; i++){
3023         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3024     }
3025 }
3026
3027 /**
3028  * permutes an 8x8 block.
3029  * @param block the block which will be permuted according to the given permutation vector
3030  * @param permutation the permutation vector
3031  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3032  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3033  *                  (inverse) permutated to scantable order!
3034  */
3035 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3036 {
3037     int i;
3038     DCTELEM temp[64];
3039
3040     if(last<=0) return;
3041     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3042
3043     for(i=0; i<=last; i++){
3044         const int j= scantable[i];
3045         temp[j]= block[j];
3046         block[j]=0;
3047     }
3048
3049     for(i=0; i<=last; i++){
3050         const int j= scantable[i];
3051         const int perm_j= permutation[j];
3052         block[perm_j]= temp[j];
3053     }
3054 }
3055
/* Comparison stub: ignores every argument and always returns a score of 0. */
3056 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3057     return 0;
3058 }
3059
3060 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3061     int i;
3062
3063     memset(cmp, 0, sizeof(void*)*6);
3064
3065     for(i=0; i<6; i++){
3066         switch(type&0xFF){
3067         case FF_CMP_SAD:
3068             cmp[i]= c->sad[i];
3069             break;
3070         case FF_CMP_SATD:
3071             cmp[i]= c->hadamard8_diff[i];
3072             break;
3073         case FF_CMP_SSE:
3074             cmp[i]= c->sse[i];
3075             break;
3076         case FF_CMP_DCT:
3077             cmp[i]= c->dct_sad[i];
3078             break;
3079         case FF_CMP_DCT264:
3080             cmp[i]= c->dct264_sad[i];
3081             break;
3082         case FF_CMP_DCTMAX:
3083             cmp[i]= c->dct_max[i];
3084             break;
3085         case FF_CMP_PSNR:
3086             cmp[i]= c->quant_psnr[i];
3087             break;
3088         case FF_CMP_BIT:
3089             cmp[i]= c->bit[i];
3090             break;
3091         case FF_CMP_RD:
3092             cmp[i]= c->rd[i];
3093             break;
3094         case FF_CMP_VSAD:
3095             cmp[i]= c->vsad[i];
3096             break;
3097         case FF_CMP_VSSE:
3098             cmp[i]= c->vsse[i];
3099             break;
3100         case FF_CMP_ZERO:
3101             cmp[i]= zero_cmp;
3102             break;
3103         case FF_CMP_NSSE:
3104             cmp[i]= c->nsse[i];
3105             break;
3106 #if CONFIG_DWT
3107         case FF_CMP_W53:
3108             cmp[i]= c->w53[i];
3109             break;
3110         case FF_CMP_W97:
3111             cmp[i]= c->w97[i];
3112             break;
3113 #endif
3114         default:
3115             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3116         }
3117     }
3118 }
3119
3120 static void clear_block_c(DCTELEM *block)
3121 {
3122     memset(block, 0, sizeof(DCTELEM)*64);
3123 }
3124
3125 /**
3126  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3127  */
3128 static void clear_blocks_c(DCTELEM *blocks)
3129 {
3130     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3131 }
3132
/**
 * Byte-wise dst[i] += src[i] for i in [0, w), wrapping modulo 256.
 *
 * The bulk is done one long at a time: the low 7 bits of every byte are added
 * without carry into the neighbouring byte, and the top bit of each byte is then
 * fixed up with an XOR. The remaining bytes are handled one by one.
 *
 * @param dst destination, also the first operand
 * @param src second operand
 * @param w   number of bytes; values below sizeof(long) (including <= 0) are safe
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    /* The bound must be computed as a signed value: with the unsigned sizeof result
     * w-sizeof(long) wraps to a huge number whenever w < sizeof(long) and the loop
     * would run far past the end of both buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
3143
/**
 * Byte-wise dst[i] = src1[i] + src2[i] for i in [0, w), wrapping modulo 256.
 *
 * The bulk is done one long at a time (low 7 bits of each byte added without
 * cross-byte carry, top bit fixed up with an XOR); the tail is done per byte.
 *
 * @param dst  destination
 * @param src1 first operand
 * @param src2 second operand
 * @param w    number of bytes; values below sizeof(long) (including <= 0) are safe
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    /* Signed bound on purpose: w-sizeof(long) is unsigned and wraps around for
     * w < sizeof(long), which would send the word loop out of bounds. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
3154
/**
 * Byte-wise dst[i] = src1[i] - src2[i] for i in [0, w), wrapping modulo 256.
 *
 * When HAVE_FAST_UNALIGNED is not set and src2 is not long-aligned, the bulk is
 * done with an 8x unrolled byte loop. Otherwise it is done one long at a time:
 * the top bit of every byte of a is forced on so the 7-bit subtraction cannot
 * borrow from the neighbouring byte, and the true top bits are restored with an
 * XOR. The remaining bytes are handled one by one in both cases.
 *
 * @param dst  destination
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes; values below sizeof(long) (including <= 0) are safe
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* Signed bound on purpose: w-sizeof(long) is unsigned and wraps around for
     * w < sizeof(long), which would send the word loop out of bounds. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
3179
/**
 * Reconstruct one row from median-predicted residuals.
 *
 * For every position the predictor is mid_pred(left, top, (left + top - topleft) & 0xFF),
 * where top is src1[i]; the residual diff[i] is added and the result, truncated
 * to 8 bits, becomes both dst[i] and the next left value.
 *
 * @param dst      reconstructed row
 * @param src1     row above
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: left neighbour of the first sample; out: last reconstructed sample
 * @param left_top in: top-left neighbour of the first sample; out: last sample of src1 read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur    = *left;
    uint8_t corner = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t top = src1[n];

        cur    = mid_pred(cur, top, (cur + top - corner)&0xFF) + diff[n];
        corner = top;
        dst[n] = cur;
    }

    *left     = cur;
    *left_top = corner;
}
3196
/**
 * Compute median-prediction residuals for one row.
 *
 * For every position the predictor is mid_pred(left, top, (left + top - topleft) & 0xFF),
 * with top taken from src1 and left from the previous sample of src2; dst[i] is
 * src2[i] minus that predictor (stored as 8 bits).
 *
 * @param dst      residuals
 * @param src1     row above
 * @param src2     current row
 * @param w        number of samples
 * @param left     in: left neighbour of the first sample; out: last sample of src2 read
 * @param left_top in: top-left neighbour of the first sample; out: last sample of src1 read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur    = *left;
    uint8_t corner = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t top = src1[n];
        const int pred = mid_pred(cur, top, (cur + top - corner)&0xFF);

        corner = top;
        cur    = src2[n];
        dst[n] = cur - pred;
    }

    *left     = cur;
    *left_top = corner;
}
3214
/**
 * Running-sum (left prediction) reconstruction of one row.
 *
 * acc is increased by each source byte in turn and its low 8 bits are stored
 * to dst; acc itself is never masked.
 *
 * @param dst destination row
 * @param src residuals
 * @param w   number of samples
 * @param acc starting accumulator value
 * @return the accumulator after the last sample (unchanged when w <= 0)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int n;

    for (n = 0; n < w; n++) {
        acc   += src[n];
        dst[n] = acc;
    }

    return acc;
}
3233
/* Byte offset of each channel inside a 4-byte pixel, selected by HAVE_BIGENDIAN. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum (left prediction) reconstruction of a row of 4-byte pixels,
 * with one independent accumulator per channel.
 *
 * @param dst   destination row, 4 bytes per pixel
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: starting accumulator; out: accumulator after the last pixel
 * @param green same, for the green channel
 * @param blue  same, for the blue channel
 * @param alpha same, for the alpha channel
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int n;

    for (n = 0; n < w; n++, src += 4, dst += 4) {
        /* read the whole pixel before writing any of it */
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3274
/* BUTTERFLY2: out-of-place butterfly, o1 = i1 + i2 and o2 = i1 - i2.
 * It expands to two separate statements, so it must not be used as the
 * unbraced body of an if/else or a loop; i1 and i2 are each evaluated twice. */
3275 #define BUTTERFLY2(o1,o2,i1,i2) \
3276 o1= (i1)+(i2);\
3277 o2= (i1)-(i2);
3278
/* BUTTERFLY1: in-place butterfly, replaces (x, y) by (x + y, x - y).
 * x and y must be lvalues; each is read once and written once. The expansion
 * declares locals named a and b, so arguments must not refer to variables
 * with those names. */
3279 #define BUTTERFLY1(x,y) \
3280 {\
3281     int a,b;\
3282     a= x;\
3283     b= y;\
3284     x= a+b;\
3285     y= a-b;\
3286 }
3287
/* BUTTERFLYA: |x + y| + |x - y|, the absolute values of a final butterfly
 * stage without storing its outputs. */
3288 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3289
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 *
 * The rows are transformed first (three butterfly stages each), then the
 * columns; the last column stage is folded into the summation of absolute values.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *srow = src + stride*i;
        const uint8_t *drow = dst + stride*i;

        BUTTERFLY2(row[0], row[1], srow[0]-drow[0], srow[1]-drow[1]);
        BUTTERFLY2(row[2], row[3], srow[2]-drow[2], srow[3]-drow[3]);
        BUTTERFLY2(row[4], row[5], srow[4]-drow[4], srow[5]-drow[5]);
        BUTTERFLY2(row[6], row[7], srow[6]-drow[6], srow[7]-drow[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the final stage only contributes absolute values */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3341
/**
 * Sum of absolute values of the 8x8 Hadamard transform of a single block,
 * with the mean (DC) term removed.
 *
 * The rows are transformed first, then the columns; the last column stage is
 * folded into the summation. Finally |temp[0] + temp[32]|, the DC contribution
 * of that last stage, is subtracted.
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed values minus the DC term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *srow = src + stride*i;

        BUTTERFLY2(row[0], row[1], srow[0], srow[1]);
        BUTTERFLY2(row[2], row[3], srow[2], srow[3]);
        BUTTERFLY2(row[4], row[5], srow[4], srow[5]);
        BUTTERFLY2(row[6], row[7], srow[6], srow[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the final stage only contributes absolute values */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3389
/**
 * 8x8 comparison in the DCT domain: takes the pixel difference of the two blocks
 * with the context's diff_pixels, runs the context's forward DCT on it and returns
 * what sum_abs_dctelem reports for the result.
 *
 * @param c      must point to an MpegEncContext (its dsp callbacks are used)
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride passed to diff_pixels
 * @param h      must be 8 (asserted)
 */
3390 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3391     MpegEncContext * const s= (MpegEncContext *)c;
3392     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3393
3394     assert(h==8);
3395
3396     s->dsp.diff_pixels(temp, src1, src2, stride);
3397     s->dsp.fdct(temp);
3398     return s->dsp.sum_abs_dctelem(temp);
3399 }
3400
3401 #if CONFIG_GPL
/* DCT8_1D: one 8-point integer transform pass using only additions, subtractions
 * and shifts. Inputs are read through SRC(n) and outputs are emitted through
 * DST(n, value); both helper macros must be defined at the point of expansion,
 * along with any variables they refer to. SRC(n) is read twice for every n.
 * NOTE(review): going by the name of the function below this is presumably the
 * H.264-style 8x8 transform - confirm. */
3402 #define DCT8_1D {\
3403     const int s07 = SRC(0) + SRC(7);\
3404     const int s16 = SRC(1) + SRC(6);\
3405     const int s25 = SRC(2) + SRC(5);\
3406     const int s34 = SRC(3) + SRC(4);\
3407     const int a0 = s07 + s34;\
3408     const int a1 = s16 + s25;\
3409     const int a2 = s07 - s34;\
3410     const int a3 = s16 - s25;\
3411     const int d07 = SRC(0) - SRC(7);\
3412     const int d16 = SRC(1) - SRC(6);\
3413     const int d25 = SRC(2) - SRC(5);\
3414     const int d34 = SRC(3) - SRC(4);\
3415     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3416     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3417     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3418     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3419     DST(0,  a0 + a1     ) ;\
3420     DST(1,  a4 + (a7>>2)) ;\
3421     DST(2,  a2 + (a3>>1)) ;\
3422     DST(3,  a5 + (a6>>2)) ;\
3423     DST(4,  a0 - a1     ) ;\
3424     DST(5,  a6 - (a5>>2)) ;\
3425     DST(6, (a2>>1) - a3 ) ;\
3426     DST(7, (a4>>2) - a7 ) ;\
3427 }
3428
/**
 * 8x8 comparison: sum of absolute values of the two-dimensional DCT8_1D transform
 * of the pixel difference between the two blocks.
 *
 * @param c      must point to an MpegEncContext (its dsp.diff_pixels is used)
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride passed to diff_pixels
 * @param h      not used by this function
 */
3429 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3430     MpegEncContext * const s= (MpegEncContext *)c;
3431     DCTELEM dct[8][8];
3432     int i;
3433     int sum=0;
3434
3435     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3436
/* first pass: transform every row i in place */
3437 #define SRC(x) dct[i][x]
3438 #define DST(x,v) dct[i][x]= v
3439     for( i = 0; i < 8; i++ )
3440         DCT8_1D
3441 #undef SRC
3442 #undef DST
3443
/* second pass: transform every column i; the outputs are not stored, only
 * their absolute values are accumulated */
3444 #define SRC(x) dct[x][i]
3445 #define DST(x,v) sum += FFABS(v)
3446     for( i = 0; i < 8; i++ )
3447         DCT8_1D
3448 #undef SRC
3449 #undef DST
3450     return sum;
3451 }
3452 #endif
3453
3454 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3455     MpegEncContext * const s= (MpegEncContext *)c;
3456     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3457     int sum=0, i;
3458
3459     assert(h==8);
3460
3461     s->dsp.diff_pixels(temp, src1, src2, stride);
3462     s->dsp.fdct(temp);
3463
3464     for(i=0; i<64; i++)
3465         sum= FFMAX(sum, FFABS(temp[i]));
3466
3467     return sum;
3468 }
3469
/**
 * 8x8 comparison: squared error between the pixel difference of the two blocks and
 * the same difference after it has gone through fast_dct_quantize,
 * dct_unquantize_inter and ff_simple_idct.
 *
 * Side effects on the context: mb_intra is set to 0 and block_last_index[0]
 * is overwritten.
 *
 * @param c      must point to an MpegEncContext
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride passed to diff_pixels
 * @param h      must be 8 (asserted)
 */
3470 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3471     MpegEncContext * const s= (MpegEncContext *)c;
3472     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
/* second half of temp keeps an untouched copy of the difference */
3473     DCTELEM * const bak = temp+64;
3474     int sum=0, i;
3475
3476     assert(h==8);
3477     s->mb_intra=0;
3478
3479     s->dsp.diff_pixels(temp, src1, src2, stride);
3480
3481     memcpy(bak, temp, 64*sizeof(DCTELEM));
3482
/* i only receives the last output argument of the quantizer; it is reused as the loop index below */
3483     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3484     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3485     ff_simple_idct(temp); //FIXME
3486
3487     for(i=0; i<64; i++)
3488         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3489
3490     return sum;
3491 }
3492
/**
 * 8x8 rate-distortion comparison.
 *
 * The pixel difference of the two blocks is quantized, the bits needed for its
 * coefficients are summed from the context's VLC length tables, and the block is
 * reconstructed (unquantize + idct_add onto a copy of src2) to measure the
 * distortion against a copy of src1 with sse[1].
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      must point to an MpegEncContext
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride of src1 and src2
 * @param h      must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
3493 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3494     MpegEncContext * const s= (MpegEncContext *)c;
3495     const uint8_t *scantable= s->intra_scantable.permutated;
3496     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3497     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3498     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3499     int i, last, run, bits, level, distortion, start_i;
3500     const int esc_length= s->ac_esc_length;
3501     uint8_t * length;
3502     uint8_t * last_length;
3503
3504     assert(h==8);
3505
/* work on local copies with a row stride of 8; lsrc2 is modified further down */
3506     copy_block8(lsrc1, src1, 8, stride, 8);
3507     copy_block8(lsrc2, src2, 8, stride, 8);
3508
3509     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3510
3511     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3512
3513     bits=0;
3514
/* pick the length tables; for intra blocks the first coefficient is costed
 * separately through luma_dc_vlc_length and the scan starts at position 1 */
3515     if (s->mb_intra) {
3516         start_i = 1;
3517         length     = s->intra_ac_vlc_length;
3518         last_length= s->intra_ac_vlc_last_length;
3519         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3520     } else {
3521         start_i = 0;
3522         length     = s->inter_ac_vlc_length;
3523         last_length= s->inter_ac_vlc_last_length;
3524     }
3525
3526     if(last>=start_i){
3527         run=0;
/* every coefficient before the last one: table lookup when level+64 fits in
 * 0..127, escape length otherwise; zeros only extend the run */
3528         for(i=start_i; i<last; i++){
3529             int j= scantable[i];
3530             level= temp[j];
3531
3532             if(level){
3533                 level+=64;
3534                 if((level&(~127)) == 0){
3535                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3536                 }else
3537                     bits+= esc_length;
3538                 run=0;
3539             }else
3540                 run++;
3541         }
/* the last coefficient uses the separate "last" length table */
3542         i= scantable[last];
3543
3544         level= temp[i] + 64;
3545
3546         assert(level - 64);
3547
3548         if((level&(~127)) == 0){
3549             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3550         }else
3551             bits+= esc_length;
3552
3553     }
3554
/* reconstruct: dequantize only when the quantizer reported a last index >= 0 */
3555     if(last>=0){
3556         if(s->mb_intra)
3557             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3558         else
3559             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3560     }
3561
3562     s->dsp.idct_add(lsrc2, 8, temp);
3563
3564     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3565
3566     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3567 }
3568
/**
 * 8x8 comparison by bit cost: the pixel difference of the two blocks is quantized
 * and the bits needed for its coefficients are summed from the context's VLC
 * length tables.
 *
 * Side effect on the context: block_last_index[0] is overwritten.
 *
 * @param c      must point to an MpegEncContext
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride passed to diff_pixels
 * @param h      must be 8 (asserted)
 * @return number of bits according to the length tables
 */
3569 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3570     MpegEncContext * const s= (MpegEncContext *)c;
3571     const uint8_t *scantable= s->intra_scantable.permutated;
3572     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3573     int i, last, run, bits, level, start_i;
3574     const int esc_length= s->ac_esc_length;
3575     uint8_t * length;
3576     uint8_t * last_length;
3577
3578     assert(h==8);
3579
3580     s->dsp.diff_pixels(temp, src1, src2, stride);
3581
3582     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3583
3584     bits=0;
3585
/* pick the length tables; for intra blocks the first coefficient is costed
 * separately through luma_dc_vlc_length and the scan starts at position 1 */
3586     if (s->mb_intra) {
3587         start_i = 1;
3588         length     = s->intra_ac_vlc_length;
3589         last_length= s->intra_ac_vlc_last_length;
3590         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3591     } else {
3592         start_i = 0;
3593         length     = s->inter_ac_vlc_length;
3594         last_length= s->inter_ac_vlc_last_length;
3595     }
3596
3597     if(last>=start_i){
3598         run=0;
/* every coefficient before the last one: table lookup when level+64 fits in
 * 0..127, escape length otherwise; zeros only extend the run */
3599         for(i=start_i; i<last; i++){
3600             int j= scantable[i];
3601             level= temp[j];
3602
3603             if(level){
3604                 level+=64;
3605                 if((level&(~127)) == 0){
3606                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3607                 }else
3608                     bits+= esc_length;
3609                 run=0;
3610             }else
3611                 run++;
3612         }
/* the last coefficient uses the separate "last" length table */
3613         i= scantable[last];
3614
3615         level= temp[i] + 64;
3616
3617         assert(level - 64);
3618
3619         if((level&(~127)) == 0){
3620             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3621         }else
3622             bits+= esc_length;
3623     }
3624
3625     return bits;
3626 }
3627
/* VSAD_INTRA(size) defines vsad_intra<size>_c(): the sum of absolute differences
 * between vertically adjacent pixels of one block that is `size` pixels wide
 * and h rows high (h-1 row pairs). The c and dummy arguments are unused. The
 * inner loop handles 4 pixels per step, so size must be a multiple of 4. */
3628 #define VSAD_INTRA(size) \
3629 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3630     int score=0;                                                                                            \
3631     int x,y;                                                                                                \
3632                                                                                                             \
3633     for(y=1; y<h; y++){                                                                                     \
3634         for(x=0; x<size; x+=4){                                                                             \
3635             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
3636                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
3637         }                                                                                                   \
3638         s+= stride;                                                                                         \
3639     }                                                                                                       \
3640                                                                                                             \
3641     return score;                                                                                           \
3642 }
/* instantiate the 8- and 16-pixel-wide variants */
3643 VSAD_INTRA(8)
3644 VSAD_INTRA(16)
3645
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: for every
 * pair of vertically adjacent rows, the absolute change of (s1 - s2) from one
 * row to the next is summed over the 16 columns.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      number of rows (h-1 row pairs are evaluated)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col] - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            total += FFABS(here - below);
        }
    }

    return total;
}
3660
/* SQ(a): square of a; the argument is evaluated twice. */
3661 #define SQ(a) ((a)*(a))
/* VSSE_INTRA(size) defines vsse_intra<size>_c(): the sum of squared differences
 * between vertically adjacent pixels of one block that is `size` pixels wide
 * and h rows high (h-1 row pairs). The c and dummy arguments are unused. The
 * inner loop handles 4 pixels per step, so size must be a multiple of 4. */
3662 #define VSSE_INTRA(size) \
3663 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3664     int score=0;                                                                                            \
3665     int x,y;                                                                                                \
3666                                                                                                             \
3667     for(y=1; y<h; y++){                                                                                     \
3668         for(x=0; x<size; x+=4){                                                                               \
3669             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
3670                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
3671         }                                                                                                   \
3672         s+= stride;                                                                                         \
3673     }                                                                                                       \
3674                                                                                                             \
3675     return score;                                                                                           \
3676 }
/* instantiate the 8- and 16-pixel-wide variants */
3677 VSSE_INTRA(8)
3678 VSSE_INTRA(16)
3679
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: for every
 * pair of vertically adjacent rows, the squared change of (s1 - s2) from one
 * row to the next is summed over the 16 columns.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      number of rows (h-1 row pairs are evaluated)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += delta * delta;
        }
    }

    return total;
}
3694
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return sum over i of (pix1[i] - pix2[i])^2, 0 when size <= 0
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int n;

    for (n = 0; n < size; n++) {
        const int d = pix1[n] - pix2[n];
        total += d * d;
    }
    return total;
}
3703
/* Instantiate the *16_c comparison functions from their 8x8 counterparts.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this chunk; presumably it
 * builds the function named by the second argument out of calls to the 8x8
 * function named by the first - confirm against its definition. */
3704 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3705 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3706 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad8x8_c only exists in CONFIG_GPL builds */
3707 #if CONFIG_GPL
3708 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3709 #endif
3710 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3711 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3712 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3713 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3714
/**
 * Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len).
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int n = 0;

    while (n < len) {
        dst[n] = src0[n] * src1[n];
        n++;
    }
}
3720
/**
 * Element-wise product with the second factor read backwards:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor, traversed from its last element to its first
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
3727
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i] for i in [0, len).
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n = 0;

    while (n < len) {
        dst[n] = src0[n] * src1[n] + src2[n];
        n++;
    }
}
3733
/**
 * Windowed overlap of two half-blocks into an output of 2*len samples.
 *
 * For k in [0, len), with lo = k and hi = 2*len-1-k:
 *   dst[lo] = src0[k]*win[hi] - src1[len-1-k]*win[lo]
 *   dst[hi] = src0[k]*win[lo] + src1[len-1-k]*win[hi]
 *
 * @param dst  output, 2*len elements are written
 * @param src0 first input, len elements read front to back
 * @param src1 second input, len elements read back to front
 * @param win  window, 2*len elements
 * @param len  half the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const int lo = k;
        const int hi = 2*len - 1 - k;
        float s0 = src0[lo];
        float s1 = src1[len - 1 - k];
        float wi = win[lo];
        float wj = win[hi];
        dst[lo] = s0*wj - s1*wi;
        dst[hi] = s0*wi + s1*wj;
    }
}
3750
/**
 * Scale a vector: dst[i] = src[i] * mul for i in [0, len).
 *
 * @param dst output vector
 * @param src input vector
 * @param mul scale factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
3758
/**
 * Multiply src by a sequence of 2-element vectors and by a scalar.
 *
 * Output pair number p (elements 2p and 2p+1) is src times the two elements
 * of sv[p], times mul. Pairs are produced while 2p < len, so an odd len
 * still writes a complete final pair.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scale factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    for (n = 0; n < len; n += 2) {
        const float *vec = sv[n >> 1];

        dst[n    ] = src[n    ] * vec[0] * mul;
        dst[n + 1] = src[n + 1] * vec[1] * mul;
    }
}
3768
/**
 * Multiply src by a sequence of 4-element vectors and by a scalar.
 *
 * Output group number g (elements 4g .. 4g+3) is src times the four elements
 * of sv[g], times mul. Groups are produced while 4g < len, so a len that is
 * not a multiple of 4 still writes a complete final group.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scale factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    for (n = 0; n < len; n += 4) {
        const float *vec = sv[n >> 2];

        dst[n    ] = src[n    ] * vec[0] * mul;
        dst[n + 1] = src[n + 1] * vec[1] * mul;
        dst[n + 2] = src[n + 2] * vec[2] * mul;
        dst[n + 3] = src[n + 3] * vec[3] * mul;
    }
}
3780
/**
 * Concatenate a sequence of 2-element vectors, scaled by mul.
 *
 * Output pair number p (elements 2p and 2p+1) is sv[p][0..1] times mul.
 * Pairs are produced while 2p < len.
 *
 * @param dst output vector
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scale factor
 * @param len number of elements
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    for (n = 0; n < len; n += 2) {
        const float *vec = sv[n >> 1];

        dst[n    ] = vec[0] * mul;
        dst[n + 1] = vec[1] * mul;
    }
}
3790
/**
 * Concatenate a sequence of 4-element vectors, scaled by mul.
 *
 * Output group number g (elements 4g .. 4g+3) is sv[g][0..3] times mul.
 * Groups are produced while 4g < len.
 *
 * @param dst output vector
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scale factor
 * @param len number of elements
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    for (n = 0; n < len; n += 4) {
        const float *vec = sv[n >> 2];

        dst[n    ] = vec[0] * mul;
        dst[n + 1] = vec[1] * mul;
        dst[n + 2] = vec[2] * mul;
        dst[n + 3] = vec[3] * mul;
    }
}
3802
/**
 * In-place butterfly on two vectors: afterwards v1[i] holds the old
 * v1[i] + v2[i] and v2[i] holds the old v1[i] - v2[i].
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float a = v1[n];
        const float b = v2[n];

        v1[n] = a + b;
        v2[n] = a - b;
    }
}
3813
/**
 * Dot product of two float vectors, accumulated in a float from the first
 * element to the last.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return sum of v1[i] * v2[i], 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n = 0;

    while (n < len) {
        acc += v1[n] * v2[n];
        n++;
    }

    return acc;
}
3824
/**
 * Clip one value using unsigned comparisons on 32-bit patterns.
 *
 * @param a        pattern to clip
 * @param mini     returned whenever a > mini (unsigned compare)
 * @param maxi     returned whenever (a ^ 0x80000000) > maxisign (unsigned compare)
 * @param maxisign threshold for the second test; the caller in this file passes
 *                 maxi ^ 0x80000000
 * @return mini, maxi or a, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U<<31);

    if (a > mini)
        return mini;
    return flipped > maxisign ? maxi : a;
}
3833
/**
 * Clip a float vector to [*min, *max] by comparing raw 32-bit patterns with
 * clipf_c_one(). The only caller in this file uses it when min < 0 and max > 0.
 *
 * Elements are processed in groups of 8 while the group start is below len,
 * so a len that is not a multiple of 8 still reads and writes a complete
 * final group.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    const uint32_t lo_bits  = *(uint32_t*)min;
    const uint32_t hi_bits  = *(uint32_t*)max;
    const uint32_t hi_flip  = hi_bits ^ (1U<<31);
    uint32_t *out           = (uint32_t*)dst;
    const uint32_t *in      = (const uint32_t*)src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
    }
}
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs (min < 0 and max > 0) the work is handed
 * to vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per element.
 * Elements are processed in groups of 8 while the group start is below len,
 * so a len that is not a multiple of 8 still reads and writes a complete
 * final group.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3869
/**
 * Dot product of two int16_t vectors, with every product shifted right by
 * shift before it is accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each individual product
 * @return sum of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order != 0; order--, v1++, v2++)
        acc += (*v1 * *v2) >> shift;

    return acc;
}
3879
/**
 * Compute the dot product of v1 and v2 and, in the same pass, update v1 in
 * place with v1[i] += mul * v3[i]. Each product uses the value of v1[i] from
 * before its update.
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier for v3
 * @return sum of (old v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order != 0; order--, v1++, v2++, v3++) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
    }
    return acc;
}
3889
/**
 * Apply a symmetric window to 16-bit samples.
 *
 * Only the first len/2 window coefficients are read; coefficient i is applied
 * to sample i and to its mirror sample len-1-i. Each product is rounded by
 * adding 1<<14 and shifted right by 15. When len is odd the middle sample is
 * not written.
 *
 * @param output windowed samples
 * @param input  samples to window
 * @param window first half of the window
 * @param len    number of samples
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int i;

    for (i = 0; i < half; i++) {
        const unsigned int mirror = len - i - 1;
        int16_t w = window[i];

        output[i]      = (MUL16(input[i],      w) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], w) + (1 << 14)) >> 15;
    }
}
3902
/* Integer multipliers used by wmv2_idct_row() and wmv2_idct_col() below:
 * Wk = 2048*sqrt(2)*cos(k*pi/16); W0 is the plain scale factor 2048. */
3903 #define W0 2048
3904 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3905 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3906 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3907 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3908 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3909 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3910 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3911
3912 static void wmv2_idct_row(short * b)
3913 {
3914     int s1,s2;
3915     int a0,a1,a2,a3,a4,a5,a6,a7;
3916     /*step 1*/
3917     a1 = W1*b[1]+W7*b[7];
3918     a7 = W7*b[1]-W1*b[7];
3919     a5 = W5*b[5]+W3*b[3];
3920     a3 = W3*b[5]-W5*b[3];
3921     a2 = W2*b[2]+W6*b[6];
3922     a6 = W6*b[2]-W2*b[6];
3923     a0 = W0*b[0]+W0*b[4];
3924     a4 = W0*b[0]-W0*b[4];
3925     /*step 2*/
3926     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3927     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3928     /*step 3*/
3929     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3930     b[1] = (a4+a6 +s1   + (1<<7))>>8;
3931     b[2] = (a4-a6 +s2   + (1<<7))>>8;
3932     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3933     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3934     b[5] = (a4-a6 -s2   + (1<<7))>>8;
3935     b[6] = (a4+a6 -s1   + (1<<7))>>8;
3936     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
3937 }
3938 static void wmv2_idct_col(short * b)
3939 {
3940     int s1,s2;
3941     int a0,a1,a2,a3,a4,a5,a6,a7;
3942     /*step 1, with extended precision*/
3943     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3944     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3945     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3946     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3947     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3948     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3949     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3950     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3951     /*step 2*/
3952     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3953     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3954     /*step 3*/
3955     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3956     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3957     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3958     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3959
3960     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3961     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3962     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3963     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3964 }
/**
 * In-place 8x8 inverse transform: all eight rows first, then all eight
 * columns.
 *
 * @param block 64 coefficients stored row by row (overwritten)
 */
void ff_wmv2_idct_c(short * block)
{
    int row, col;

    /* row pass: each row is 8 consecutive elements */
    for (row = 0; row < 8; row++) {
        wmv2_idct_row(block + 8 * row);
    }
    /* column pass: each column starts at block[col] and has stride 8 */
    for (col = 0; col < 8; col++) {
        wmv2_idct_col(block + col);
    }
}
3975 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3976  converted */
3977 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3978 {
3979     ff_wmv2_idct_c(block);
3980     ff_put_pixels_clamped_c(block, dest, line_size);
3981 }
3982 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3983 {
3984     ff_wmv2_idct_c(block);
3985     ff_add_pixels_clamped_c(block, dest, line_size);
3986 }
3987 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3988 {
3989     j_rev_dct (block);
3990     ff_put_pixels_clamped_c(block, dest, line_size);
3991 }
3992 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3993 {
3994     j_rev_dct (block);
3995     ff_add_pixels_clamped_c(block, dest, line_size);
3996 }
3997
3998 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3999 {
4000     j_rev_dct4 (block);
4001     put_pixels_clamped4_c(block, dest, line_size);
4002 }
4003 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4004 {
4005     j_rev_dct4 (block);
4006     add_pixels_clamped4_c(block, dest, line_size);
4007 }
4008
4009 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4010 {
4011     j_rev_dct2 (block);
4012     put_pixels_clamped2_c(block, dest, line_size);
4013 }
4014 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4015 {
4016     j_rev_dct2 (block);
4017     add_pixels_clamped2_c(block, dest, line_size);
4018 }
4019
4020 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4021 {
4022     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4023
4024     dest[0] = cm[(block[0] + 4)>>3];
4025 }
4026 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4027 {
4028     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4029
4030     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4031 }
4032
4033 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4034
4035 /* init static data */
4036 av_cold void dsputil_static_init(void)
4037 {
4038     int i;
4039
4040     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4041     for(i=0;i<MAX_NEG_CROP;i++) {
4042         ff_cropTbl[i] = 0;
4043         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4044     }
4045
4046     for(i=0;i<512;i++) {
4047         ff_squareTbl[i] = (i - 256) * (i - 256);
4048     }
4049
4050     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4051 }
4052
4053 int ff_check_alignment(void){
4054     static int did_fail=0;
4055     DECLARE_ALIGNED(16, int, aligned);
4056
4057     if((intptr_t)&aligned & 15){
4058         if(!did_fail){
4059 #if HAVE_MMX || HAVE_ALTIVEC
4060             av_log(NULL, AV_LOG_ERROR,
4061                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4062                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4063                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4064                 "Do not report crashes to Libav developers.\n");
4065 #endif
4066             did_fail=1;
4067         }
4068         return -1;
4069     }
4070     return 0;
4071 }
4072
4073 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4074 {
4075     int i;
4076
4077     ff_check_alignment();
4078
4079 #if CONFIG_ENCODERS
4080     if(avctx->dct_algo==FF_DCT_FASTINT) {
4081         c->fdct = fdct_ifast;
4082         c->fdct248 = fdct_ifast248;
4083     }
4084     else if(avctx->dct_algo==FF_DCT_FAAN) {
4085         c->fdct = ff_faandct;
4086         c->fdct248 = ff_faandct248;
4087     }
4088     else {
4089         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4090         c->fdct248 = ff_fdct248_islow;
4091     }
4092 #endif //CONFIG_ENCODERS
4093
4094     if(avctx->lowres==1){
4095         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4096             c->idct_put= ff_jref_idct4_put;
4097             c->idct_add= ff_jref_idct4_add;
4098         }else{
4099             c->idct_put= ff_h264_lowres_idct_put_c;
4100             c->idct_add= ff_h264_lowres_idct_add_c;
4101         }
4102         c->idct    = j_rev_dct4;
4103         c->idct_permutation_type= FF_NO_IDCT_PERM;
4104     }else if(avctx->lowres==2){
4105         c->idct_put= ff_jref_idct2_put;
4106         c->idct_add= ff_jref_idct2_add;
4107         c->idct    = j_rev_dct2;
4108         c->idct_permutation_type= FF_NO_IDCT_PERM;
4109     }else if(avctx->lowres==3){
4110         c->idct_put= ff_jref_idct1_put;
4111         c->idct_add= ff_jref_idct1_add;
4112         c->idct    = j_rev_dct1;
4113         c->idct_permutation_type= FF_NO_IDCT_PERM;
4114     }else{
4115         if(avctx->idct_algo==FF_IDCT_INT){
4116             c->idct_put= ff_jref_idct_put;
4117             c->idct_add= ff_jref_idct_add;
4118             c->idct    = j_rev_dct;
4119             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4120         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4121                 avctx->idct_algo==FF_IDCT_VP3){
4122             c->idct_put= ff_vp3_idct_put_c;
4123             c->idct_add= ff_vp3_idct_add_c;
4124             c->idct    = ff_vp3_idct_c;
4125             c->idct_permutation_type= FF_NO_IDCT_PERM;
4126         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4127             c->idct_put= ff_wmv2_idct_put_c;
4128             c->idct_add= ff_wmv2_idct_add_c;
4129             c->idct    = ff_wmv2_idct_c;
4130             c->idct_permutation_type= FF_NO_IDCT_PERM;
4131         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4132             c->idct_put= ff_faanidct_put;
4133             c->idct_add= ff_faanidct_add;
4134             c->idct    = ff_faanidct;
4135             c->idct_permutation_type= FF_NO_IDCT_PERM;
4136         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4137             c->idct_put= ff_ea_idct_put_c;
4138             c->idct_permutation_type= FF_NO_IDCT_PERM;
4139         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4140             c->idct     = ff_bink_idct_c;
4141             c->idct_add = ff_bink_idct_add_c;
4142             c->idct_put = ff_bink_idct_put_c;
4143             c->idct_permutation_type = FF_NO_IDCT_PERM;
4144         }else{ //accurate/default
4145             c->idct_put= ff_simple_idct_put;
4146             c->idct_add= ff_simple_idct_add;
4147             c->idct    = ff_simple_idct;
4148             c->idct_permutation_type= FF_NO_IDCT_PERM;
4149         }
4150     }
4151
4152     c->get_pixels = get_pixels_c;
4153     c->diff_pixels = diff_pixels_c;
4154     c->put_pixels_clamped = ff_put_pixels_clamped_c;
4155     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4156     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4157     c->add_pixels_clamped = ff_add_pixels_clamped_c;
4158     c->add_pixels8 = add_pixels8_c;
4159     c->add_pixels4 = add_pixels4_c;
4160     c->sum_abs_dctelem = sum_abs_dctelem_c;
4161     c->emulated_edge_mc = ff_emulated_edge_mc;
4162     c->gmc1 = gmc1_c;
4163     c->gmc = ff_gmc_c;
4164     c->clear_block = clear_block_c;
4165     c->clear_blocks = clear_blocks_c;
4166     c->pix_sum = pix_sum_c;
4167     c->pix_norm1 = pix_norm1_c;
4168
4169     c->fill_block_tab[0] = fill_block16_c;
4170     c->fill_block_tab[1] = fill_block8_c;
4171     c->scale_block = scale_block_c;
4172
4173     /* TODO [0] 16  [1] 8 */
4174     c->pix_abs[0][0] = pix_abs16_c;
4175     c->pix_abs[0][1] = pix_abs16_x2_c;
4176     c->pix_abs[0][2] = pix_abs16_y2_c;
4177     c->pix_abs[0][3] = pix_abs16_xy2_c;
4178     c->pix_abs[1][0] = pix_abs8_c;
4179     c->pix_abs[1][1] = pix_abs8_x2_c;
4180     c->pix_abs[1][2] = pix_abs8_y2_c;
4181     c->pix_abs[1][3] = pix_abs8_xy2_c;
4182
4183 #define dspfunc(PFX, IDX, NUM) \
4184     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4185     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4186     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4187     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4188
4189     dspfunc(put, 0, 16);
4190     dspfunc(put_no_rnd, 0, 16);
4191     dspfunc(put, 1, 8);
4192     dspfunc(put_no_rnd, 1, 8);
4193     dspfunc(put, 2, 4);
4194     dspfunc(put, 3, 2);
4195
4196     dspfunc(avg, 0, 16);
4197     dspfunc(avg_no_rnd, 0, 16);
4198     dspfunc(avg, 1, 8);
4199     dspfunc(avg_no_rnd, 1, 8);
4200     dspfunc(avg, 2, 4);
4201     dspfunc(avg, 3, 2);
4202 #undef dspfunc
4203
4204     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4205     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4206
4207     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4208     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4209     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4210     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4211     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4212     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4213     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4214     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4215     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4216
4217     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4218     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4219     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4220     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4221     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4222     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4223     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4224     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4225     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4226
4227 #define dspfunc(PFX, IDX, NUM) \
4228     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4229     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4230     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4231     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4232     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4233     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4234     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4235     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4236     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4237     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4238     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4239     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4240     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4241     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4242     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4243     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4244
4245     dspfunc(put_qpel, 0, 16);
4246     dspfunc(put_no_rnd_qpel, 0, 16);
4247
4248     dspfunc(avg_qpel, 0, 16);
4249     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4250
4251     dspfunc(put_qpel, 1, 8);
4252     dspfunc(put_no_rnd_qpel, 1, 8);
4253
4254     dspfunc(avg_qpel, 1, 8);
4255     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4256
4257     dspfunc(put_h264_qpel, 0, 16);
4258     dspfunc(put_h264_qpel, 1, 8);
4259     dspfunc(put_h264_qpel, 2, 4);
4260     dspfunc(put_h264_qpel, 3, 2);
4261     dspfunc(avg_h264_qpel, 0, 16);
4262     dspfunc(avg_h264_qpel, 1, 8);
4263     dspfunc(avg_h264_qpel, 2, 4);
4264
4265 #undef dspfunc
4266     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4267     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4268     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4269     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4270     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4271     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4272
4273     c->draw_edges = draw_edges_c;
4274
4275 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4276     ff_mlp_init(c, avctx);
4277 #endif
4278 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4279     ff_intrax8dsp_init(c,avctx);
4280 #endif
4281 #if CONFIG_RV30_DECODER
4282     ff_rv30dsp_init(c,avctx);
4283 #endif
4284 #if CONFIG_RV40_DECODER
4285     ff_rv40dsp_init(c,avctx);
4286     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4287     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4288     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4289     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4290 #endif
4291
4292     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4293     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4294     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4295     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4296     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4297     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4298     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4299     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4300
4301 #define SET_CMP_FUNC(name) \
4302     c->name[0]= name ## 16_c;\
4303     c->name[1]= name ## 8x8_c;
4304
4305     SET_CMP_FUNC(hadamard8_diff)
4306     c->hadamard8_diff[4]= hadamard8_intra16_c;
4307     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4308     SET_CMP_FUNC(dct_sad)
4309     SET_CMP_FUNC(dct_max)
4310 #if CONFIG_GPL
4311     SET_CMP_FUNC(dct264_sad)
4312 #endif
4313     c->sad[0]= pix_abs16_c;
4314     c->sad[1]= pix_abs8_c;
4315     c->sse[0]= sse16_c;
4316     c->sse[1]= sse8_c;
4317     c->sse[2]= sse4_c;
4318     SET_CMP_FUNC(quant_psnr)
4319     SET_CMP_FUNC(rd)
4320     SET_CMP_FUNC(bit)
4321     c->vsad[0]= vsad16_c;
4322     c->vsad[4]= vsad_intra16_c;
4323     c->vsad[5]= vsad_intra8_c;
4324     c->vsse[0]= vsse16_c;
4325     c->vsse[4]= vsse_intra16_c;
4326     c->vsse[5]= vsse_intra8_c;
4327     c->nsse[0]= nsse16_c;
4328     c->nsse[1]= nsse8_c;
4329 #if CONFIG_DWT
4330     ff_dsputil_init_dwt(c);
4331 #endif
4332
4333     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4334
4335     c->add_bytes= add_bytes_c;
4336     c->add_bytes_l2= add_bytes_l2_c;
4337     c->diff_bytes= diff_bytes_c;
4338     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4339     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4340     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4341     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4342     c->bswap_buf= bswap_buf;
4343     c->bswap16_buf = bswap16_buf;
4344 #if CONFIG_PNG_DECODER
4345     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4346 #endif
4347
4348     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4349         c->h263_h_loop_filter= h263_h_loop_filter_c;
4350         c->h263_v_loop_filter= h263_v_loop_filter_c;
4351     }
4352
4353     if (CONFIG_VP3_DECODER) {
4354         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4355         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4356         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4357     }
4358
4359     c->h261_loop_filter= h261_loop_filter_c;
4360
4361     c->try_8x8basis= try_8x8basis_c;
4362     c->add_8x8basis= add_8x8basis_c;
4363
4364 #if CONFIG_VORBIS_DECODER
4365     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4366 #endif
4367 #if CONFIG_AC3_DECODER
4368     c->ac3_downmix = ff_ac3_downmix_c;
4369 #endif
4370     c->vector_fmul = vector_fmul_c;
4371     c->vector_fmul_reverse = vector_fmul_reverse_c;
4372     c->vector_fmul_add = vector_fmul_add_c;
4373     c->vector_fmul_window = vector_fmul_window_c;
4374     c->vector_clipf = vector_clipf_c;
4375     c->scalarproduct_int16 = scalarproduct_int16_c;
4376     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4377     c->apply_window_int16 = apply_window_int16_c;
4378     c->scalarproduct_float = scalarproduct_float_c;
4379     c->butterflies_float = butterflies_float_c;
4380     c->vector_fmul_scalar = vector_fmul_scalar_c;
4381
4382     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4383     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4384
4385     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4386     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4387
4388     c->shrink[0]= av_image_copy_plane;
4389     c->shrink[1]= ff_shrink22;
4390     c->shrink[2]= ff_shrink44;
4391     c->shrink[3]= ff_shrink88;
4392
4393     c->prefetch= just_return;
4394
4395     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4396     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4397
4398     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4399     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4400     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4401     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4402     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4403     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4404     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4405     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4406     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4407
4408     for(i=0; i<64; i++){
4409         if(!c->put_2tap_qpel_pixels_tab[0][i])
4410             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4411         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4412             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4413     }
4414
4415     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4416     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4417     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4418     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4419
4420     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4421     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4422     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4423     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4424
4425     switch(c->idct_permutation_type){
4426     case FF_NO_IDCT_PERM:
4427         for(i=0; i<64; i++)
4428             c->idct_permutation[i]= i;
4429         break;
4430     case FF_LIBMPEG2_IDCT_PERM:
4431         for(i=0; i<64; i++)
4432             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4433         break;
4434     case FF_SIMPLE_IDCT_PERM:
4435         for(i=0; i<64; i++)
4436             c->idct_permutation[i]= simple_mmx_permutation[i];
4437         break;
4438     case FF_TRANSPOSE_IDCT_PERM:
4439         for(i=0; i<64; i++)
4440             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4441         break;
4442     case FF_PARTTRANS_IDCT_PERM:
4443         for(i=0; i<64; i++)
4444             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4445         break;
4446     case FF_SSE2_IDCT_PERM:
4447         for(i=0; i<64; i++)
4448             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4449         break;
4450     default:
4451         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4452     }
4453 }
4454