git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
/* Lookup tables shared by the C routines in this file; zero-initialised here.
 * ff_cropTbl is read as (ff_cropTbl + MAX_NEG_CROP)[v] by the *_clamped
 * functions, ff_squareTbl as (ff_squareTbl + 256)[d] by pix_norm1_c and the
 * sse*_c functions.
 * NOTE(review): the code that fills both tables is not visible in this part
 * of the file -- presumably done once at init time; confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
  45
// Byte-replicated constants: ~0UL/255 is 0x01 in every byte of an unsigned long,
// so these expand to 0x7f7f7f7f / 0x80808080 or 0x7f7f7f7f7f7f7f7f / 0x8080808080808080,
// depending on the width of unsigned long.
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  49
/* Zigzag scan order of an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  60
/* Zigzag scan used with the 2-4-8 IDCT. NOTE: unlike the specification,
   the two fields are interleaved here. Entry i is the raster index of the
   i-th coefficient in scan order. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  73
/* Non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer.
 * 16-byte aligned; the code that fills it is not visible in this part of the
 * file (TODO confirm where it is initialised). */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  76
/* Alternate (horizontally biased) scan order for an 8x8 block: entry i is
 * the raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  87
/* Alternate (vertically biased) scan order for an 8x8 block: entry i is
 * the raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  98
/* Input coefficient permutation expected by simple_idct_mmx: entry i is the
 * position at which raster coefficient i has to be stored.
 * NOTE(review): direction of the mapping inferred from the comment above and
 * from how permutation tables are consumed in ff_init_scantable(); confirm. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 110
/* 8-entry permutation table; by its name it describes the per-row coefficient
 * order of the SSE2 IDCT. Its users are outside this part of the file
 * (TODO confirm exact semantics there). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 112
 113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 114     int i;
 115     int end;
 116
 117     st->scantable= src_scantable;
 118
 119     for(i=0; i<64; i++){
 120         int j;
 121         j = src_scantable[i];
 122         st->permutated[i] = permutation[j];
 123 #if ARCH_PPC
 124         st->inverse[j] = i;
 125 #endif
 126     }
 127
 128     end=-1;
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = st->permutated[i];
 132         if(j>end) end=j;
 133         st->raster_end[i]= end;
 134     }
 135 }
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = ff_squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= av_bswap32(src[i+0]);
 212         dst[i+1]= av_bswap32(src[i+1]);
 213         dst[i+2]= av_bswap32(src[i+2]);
 214         dst[i+3]= av_bswap32(src[i+3]);
 215         dst[i+4]= av_bswap32(src[i+4]);
 216         dst[i+5]= av_bswap32(src[i+5]);
 217         dst[i+6]= av_bswap32(src[i+6]);
 218         dst[i+7]= av_bswap32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222     }
 223 }
 224
 225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 226 {
 227     while (len--)
 228         *dst++ = av_bswap16(*src++);
 229 }
 230
 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 232 {
 233     int s, i;
 234     uint32_t *sq = ff_squareTbl + 256;
 235
 236     s = 0;
 237     for (i = 0; i < h; i++) {
 238         s += sq[pix1[0] - pix2[0]];
 239         s += sq[pix1[1] - pix2[1]];
 240         s += sq[pix1[2] - pix2[2]];
 241         s += sq[pix1[3] - pix2[3]];
 242         pix1 += line_size;
 243         pix2 += line_size;
 244     }
 245     return s;
 246 }
 247
 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 249 {
 250     int s, i;
 251     uint32_t *sq = ff_squareTbl + 256;
 252
 253     s = 0;
 254     for (i = 0; i < h; i++) {
 255         s += sq[pix1[0] - pix2[0]];
 256         s += sq[pix1[1] - pix2[1]];
 257         s += sq[pix1[2] - pix2[2]];
 258         s += sq[pix1[3] - pix2[3]];
 259         s += sq[pix1[4] - pix2[4]];
 260         s += sq[pix1[5] - pix2[5]];
 261         s += sq[pix1[6] - pix2[6]];
 262         s += sq[pix1[7] - pix2[7]];
 263         pix1 += line_size;
 264         pix2 += line_size;
 265     }
 266     return s;
 267 }
 268
 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 270 {
 271     int s, i;
 272     uint32_t *sq = ff_squareTbl + 256;
 273
 274     s = 0;
 275     for (i = 0; i < h; i++) {
 276         s += sq[pix1[ 0] - pix2[ 0]];
 277         s += sq[pix1[ 1] - pix2[ 1]];
 278         s += sq[pix1[ 2] - pix2[ 2]];
 279         s += sq[pix1[ 3] - pix2[ 3]];
 280         s += sq[pix1[ 4] - pix2[ 4]];
 281         s += sq[pix1[ 5] - pix2[ 5]];
 282         s += sq[pix1[ 6] - pix2[ 6]];
 283         s += sq[pix1[ 7] - pix2[ 7]];
 284         s += sq[pix1[ 8] - pix2[ 8]];
 285         s += sq[pix1[ 9] - pix2[ 9]];
 286         s += sq[pix1[10] - pix2[10]];
 287         s += sq[pix1[11] - pix2[11]];
 288         s += sq[pix1[12] - pix2[12]];
 289         s += sq[pix1[13] - pix2[13]];
 290         s += sq[pix1[14] - pix2[14]];
 291         s += sq[pix1[15] - pix2[15]];
 292
 293         pix1 += line_size;
 294         pix2 += line_size;
 295     }
 296     return s;
 297 }
 298
 299 /* draw the edges of width 'w' of an image of size width, height */
 300 //FIXME check that this is ok for mpeg4 interlaced
 301 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 302 {
 303     uint8_t *ptr, *last_line;
 304     int i;
 305
 306     last_line = buf + (height - 1) * wrap;
 307     for(i=0;i<w;i++) {
 308         /* top and bottom */
 309         memcpy(buf - (i + 1) * wrap, buf, width);
 310         memcpy(last_line + (i + 1) * wrap, last_line, width);
 311     }
 312     /* left and right */
 313     ptr = buf;
 314     for(i=0;i<height;i++) {
 315         memset(ptr - w, ptr[0], w);
 316         memset(ptr + width, ptr[width-1], w);
 317         ptr += wrap;
 318     }
 319     /* corners */
 320     for(i=0;i<w;i++) {
 321         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 322         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 323         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 324         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 325     }
 326 }
 327
 328 /**
 329  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 330  * @param buf destination buffer
 331  * @param src source buffer
 332  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 333  * @param block_w width of block
 334  * @param block_h height of block
 335  * @param src_x x coordinate of the top left sample of the block in the source buffer
 336  * @param src_y y coordinate of the top left sample of the block in the source buffer
 337  * @param w width of the source buffer
 338  * @param h height of the source buffer
 339  */
 340 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 341                                     int src_x, int src_y, int w, int h){
 342     int x, y;
 343     int start_y, start_x, end_y, end_x;
 344
 345     if(src_y>= h){
 346         src+= (h-1-src_y)*linesize;
 347         src_y=h-1;
 348     }else if(src_y<=-block_h){
 349         src+= (1-block_h-src_y)*linesize;
 350         src_y=1-block_h;
 351     }
 352     if(src_x>= w){
 353         src+= (w-1-src_x);
 354         src_x=w-1;
 355     }else if(src_x<=-block_w){
 356         src+= (1-block_w-src_x);
 357         src_x=1-block_w;
 358     }
 359
 360     start_y= FFMAX(0, -src_y);
 361     start_x= FFMAX(0, -src_x);
 362     end_y= FFMIN(block_h, h-src_y);
 363     end_x= FFMIN(block_w, w-src_x);
 364     assert(start_y < end_y && block_h);
 365     assert(start_x < end_x && block_w);
 366
 367     w    = end_x - start_x;
 368     src += start_y*linesize + start_x;
 369     buf += start_x;
 370
 371     //top
 372     for(y=0; y<start_y; y++){
 373         memcpy(buf, src, w);
 374         buf += linesize;
 375     }
 376
 377     // copy existing part
 378     for(; y<end_y; y++){
 379         memcpy(buf, src, w);
 380         src += linesize;
 381         buf += linesize;
 382     }
 383
 384     //bottom
 385     src -= linesize;
 386     for(; y<block_h; y++){
 387         memcpy(buf, src, w);
 388         buf += linesize;
 389     }
 390
 391     buf -= block_h * linesize + start_x;
 392     while (block_h--){
 393        //left
 394         for(x=0; x<start_x; x++){
 395             buf[x] = buf[start_x];
 396         }
 397
 398        //right
 399         for(x=end_x; x<block_w; x++){
 400             buf[x] = buf[end_x - 1];
 401         }
 402         buf += linesize;
 403     }
 404 }
 405
 406 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 407 {
 408     int i;
 409
 410     /* read the pixels */
 411     for(i=0;i<8;i++) {
 412         block[0] = pixels[0];
 413         block[1] = pixels[1];
 414         block[2] = pixels[2];
 415         block[3] = pixels[3];
 416         block[4] = pixels[4];
 417         block[5] = pixels[5];
 418         block[6] = pixels[6];
 419         block[7] = pixels[7];
 420         pixels += line_size;
 421         block += 8;
 422     }
 423 }
 424
 425 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 426                           const uint8_t *s2, int stride){
 427     int i;
 428
 429     /* read the pixels */
 430     for(i=0;i<8;i++) {
 431         block[0] = s1[0] - s2[0];
 432         block[1] = s1[1] - s2[1];
 433         block[2] = s1[2] - s2[2];
 434         block[3] = s1[3] - s2[3];
 435         block[4] = s1[4] - s2[4];
 436         block[5] = s1[5] - s2[5];
 437         block[6] = s1[6] - s2[6];
 438         block[7] = s1[7] - s2[7];
 439         s1 += stride;
 440         s2 += stride;
 441         block += 8;
 442     }
 443 }
 444
 445
 446 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 447                              int line_size)
 448 {
 449     int i;
 450     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 451
 452     /* read the pixels */
 453     for(i=0;i<8;i++) {
 454         pixels[0] = cm[block[0]];
 455         pixels[1] = cm[block[1]];
 456         pixels[2] = cm[block[2]];
 457         pixels[3] = cm[block[3]];
 458         pixels[4] = cm[block[4]];
 459         pixels[5] = cm[block[5]];
 460         pixels[6] = cm[block[6]];
 461         pixels[7] = cm[block[7]];
 462
 463         pixels += line_size;
 464         block += 8;
 465     }
 466 }
 467
 468 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 469                                  int line_size)
 470 {
 471     int i;
 472     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 473
 474     /* read the pixels */
 475     for(i=0;i<4;i++) {
 476         pixels[0] = cm[block[0]];
 477         pixels[1] = cm[block[1]];
 478         pixels[2] = cm[block[2]];
 479         pixels[3] = cm[block[3]];
 480
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 487                                  int line_size)
 488 {
 489     int i;
 490     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 491
 492     /* read the pixels */
 493     for(i=0;i<2;i++) {
 494         pixels[0] = cm[block[0]];
 495         pixels[1] = cm[block[1]];
 496
 497         pixels += line_size;
 498         block += 8;
 499     }
 500 }
 501
 502 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 503                                     uint8_t *restrict pixels,
 504                                     int line_size)
 505 {
 506     int i, j;
 507
 508     for (i = 0; i < 8; i++) {
 509         for (j = 0; j < 8; j++) {
 510             if (*block < -128)
 511                 *pixels = 0;
 512             else if (*block > 127)
 513                 *pixels = 255;
 514             else
 515                 *pixels = (uint8_t)(*block + 128);
 516             block++;
 517             pixels++;
 518         }
 519         pixels += (line_size - 8);
 520     }
 521 }
 522
 523 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 524                                     int line_size)
 525 {
 526     int i;
 527
 528     /* read the pixels */
 529     for(i=0;i<8;i++) {
 530         pixels[0] = block[0];
 531         pixels[1] = block[1];
 532         pixels[2] = block[2];
 533         pixels[3] = block[3];
 534         pixels[4] = block[4];
 535         pixels[5] = block[5];
 536         pixels[6] = block[6];
 537         pixels[7] = block[7];
 538
 539         pixels += line_size;
 540         block += 8;
 541     }
 542 }
 543
 544 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 545                              int line_size)
 546 {
 547     int i;
 548     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 549
 550     /* read the pixels */
 551     for(i=0;i<8;i++) {
 552         pixels[0] = cm[pixels[0] + block[0]];
 553         pixels[1] = cm[pixels[1] + block[1]];
 554         pixels[2] = cm[pixels[2] + block[2]];
 555         pixels[3] = cm[pixels[3] + block[3]];
 556         pixels[4] = cm[pixels[4] + block[4]];
 557         pixels[5] = cm[pixels[5] + block[5]];
 558         pixels[6] = cm[pixels[6] + block[6]];
 559         pixels[7] = cm[pixels[7] + block[7]];
 560         pixels += line_size;
 561         block += 8;
 562     }
 563 }
 564
 565 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 566                           int line_size)
 567 {
 568     int i;
 569     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 570
 571     /* read the pixels */
 572     for(i=0;i<4;i++) {
 573         pixels[0] = cm[pixels[0] + block[0]];
 574         pixels[1] = cm[pixels[1] + block[1]];
 575         pixels[2] = cm[pixels[2] + block[2]];
 576         pixels[3] = cm[pixels[3] + block[3]];
 577         pixels += line_size;
 578         block += 8;
 579     }
 580 }
 581
 582 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 583                           int line_size)
 584 {
 585     int i;
 586     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 587
 588     /* read the pixels */
 589     for(i=0;i<2;i++) {
 590         pixels[0] = cm[pixels[0] + block[0]];
 591         pixels[1] = cm[pixels[1] + block[1]];
 592         pixels += line_size;
 593         block += 8;
 594     }
 595 }
 596
 597 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 598 {
 599     int i;
 600     for(i=0;i<8;i++) {
 601         pixels[0] += block[0];
 602         pixels[1] += block[1];
 603         pixels[2] += block[2];
 604         pixels[3] += block[3];
 605         pixels[4] += block[4];
 606         pixels[5] += block[5];
 607         pixels[6] += block[6];
 608         pixels[7] += block[7];
 609         pixels += line_size;
 610         block += 8;
 611     }
 612 }
 613
 614 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 615 {
 616     int i;
 617     for(i=0;i<4;i++) {
 618         pixels[0] += block[0];
 619         pixels[1] += block[1];
 620         pixels[2] += block[2];
 621         pixels[3] += block[3];
 622         pixels += line_size;
 623         block += 4;
 624     }
 625 }
 626
 627 static int sum_abs_dctelem_c(DCTELEM *block)
 628 {
 629     int sum=0, i;
 630     for(i=0; i<64; i++)
 631         sum+= FFABS(block[i]);
 632     return sum;
 633 }
 634
 635 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 636 {
 637     int i;
 638
 639     for (i = 0; i < h; i++) {
 640         memset(block, value, 16);
 641         block += line_size;
 642     }
 643 }
 644
 645 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 646 {
 647     int i;
 648
 649     for (i = 0; i < h; i++) {
 650         memset(block, value, 8);
 651         block += line_size;
 652     }
 653 }
 654
 655 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 656 {
 657     int i, j;
 658     uint16_t *dst1 = (uint16_t *) dst;
 659     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 660
 661     for (j = 0; j < 8; j++) {
 662         for (i = 0; i < 8; i++) {
 663             dst1[i] = dst2[i] = src[i] * 0x0101;
 664         }
 665         src  += 8;
 666         dst1 += linesize;
 667         dst2 += linesize;
 668     }
 669 }
 670
 671 #if 0
 672
/* 64-bit-word variant of PIXOP2, currently compiled out by the enclosing
 * "#if 0". For a name prefix OPNAME and a store operation OP(dst, value) it
 * generates 8-sample-wide copy / half-sample interpolation functions that
 * handle one row per 64-bit word:
 *   _pixels                     plain OP of the source row
 *   _pixels_x2_c / _y2_c        average with the right / lower neighbour,
 *                               (a|b) - (((a^b)&0xFE..)>>1), i.e. rounding up
 *   _no_rnd_pixels_x2_c / _y2_c same, (a&b) + (((a^b)&0xFE..)>>1), rounding down
 *   _pixels_xy2_c               4-sample average, rounding constant 2 per byte
 *   _no_rnd_pixels_xy2_c        4-sample average, rounding constant 1 per byte
 * and, through CALL_2X_PIXELS, the corresponding 16-wide wrappers.
 * NOTE(review): the first function is named OPNAME##_pixels while the
 * CALL_2X_PIXELS line refers to OPNAME##_pixels_c, so this variant would
 * need fixing before it could be re-enabled. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 811
/* OP for the 64-bit variant: stores into a the byte-wise average of a and b,
 * rounded up, for all eight bytes of the word at once. The 0xFE mask keeps
 * the shifted bits from crossing into the neighbouring byte. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // end of the (disabled) 64 bit variant; the 16/32 bit variant follows
 814
 815 #define PIXOP2(OPNAME, OP) \
 816 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 817     int i;\
 818     for(i=0; i<h; i++){\
 819         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 820         pixels+=line_size;\
 821         block +=line_size;\
 822     }\
 823 }\
 824 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 825     int i;\
 826     for(i=0; i<h; i++){\
 827         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 828         pixels+=line_size;\
 829         block +=line_size;\
 830     }\
 831 }\
 832 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 833     int i;\
 834     for(i=0; i<h; i++){\
 835         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 836         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 837         pixels+=line_size;\
 838         block +=line_size;\
 839     }\
 840 }\
 841 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 842     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 843 }\
 844 \
 845 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 846                                                 int src_stride1, int src_stride2, int h){\
 847     int i;\
 848     for(i=0; i<h; i++){\
 849         uint32_t a,b;\
 850         a= AV_RN32(&src1[i*src_stride1  ]);\
 851         b= AV_RN32(&src2[i*src_stride2  ]);\
 852         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 853         a= AV_RN32(&src1[i*src_stride1+4]);\
 854         b= AV_RN32(&src2[i*src_stride2+4]);\
 855         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 856     }\
 857 }\
 858 \
 859 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 860                                                 int src_stride1, int src_stride2, int h){\
 861     int i;\
 862     for(i=0; i<h; i++){\
 863         uint32_t a,b;\
 864         a= AV_RN32(&src1[i*src_stride1  ]);\
 865         b= AV_RN32(&src2[i*src_stride2  ]);\
 866         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 867         a= AV_RN32(&src1[i*src_stride1+4]);\
 868         b= AV_RN32(&src2[i*src_stride2+4]);\
 869         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 870     }\
 871 }\
 872 \
 873 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 874                                                 int src_stride1, int src_stride2, int h){\
 875     int i;\
 876     for(i=0; i<h; i++){\
 877         uint32_t a,b;\
 878         a= AV_RN32(&src1[i*src_stride1  ]);\
 879         b= AV_RN32(&src2[i*src_stride2  ]);\
 880         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 881     }\
 882 }\
 883 \
 884 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 885                                                 int src_stride1, int src_stride2, int h){\
 886     int i;\
 887     for(i=0; i<h; i++){\
 888         uint32_t a,b;\
 889         a= AV_RN16(&src1[i*src_stride1  ]);\
 890         b= AV_RN16(&src2[i*src_stride2  ]);\
 891         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 892     }\
 893 }\
 894 \
 895 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 896                                                 int src_stride1, int src_stride2, int h){\
 897     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 898     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 899 }\
 900 \
 901 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 902                                                 int src_stride1, int src_stride2, int h){\
 903     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 904     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 905 }\
 906 \
 907 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 908     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 909 }\
 910 \
 911 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 912     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 913 }\
 914 \
 915 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 916     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 917 }\
 918 \
 919 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 920     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 921 }\
 922 \
 923 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 924                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 925     int i;\
 926     for(i=0; i<h; i++){\
 927         uint32_t a, b, c, d, l0, l1, h0, h1;\
 928         a= AV_RN32(&src1[i*src_stride1]);\
 929         b= AV_RN32(&src2[i*src_stride2]);\
 930         c= AV_RN32(&src3[i*src_stride3]);\
 931         d= AV_RN32(&src4[i*src_stride4]);\
 932         l0=  (a&0x03030303UL)\
 933            + (b&0x03030303UL)\
 934            + 0x02020202UL;\
 935         h0= ((a&0xFCFCFCFCUL)>>2)\
 936           + ((b&0xFCFCFCFCUL)>>2);\
 937         l1=  (c&0x03030303UL)\
 938            + (d&0x03030303UL);\
 939         h1= ((c&0xFCFCFCFCUL)>>2)\
 940           + ((d&0xFCFCFCFCUL)>>2);\
 941         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 942         a= AV_RN32(&src1[i*src_stride1+4]);\
 943         b= AV_RN32(&src2[i*src_stride2+4]);\
 944         c= AV_RN32(&src3[i*src_stride3+4]);\
 945         d= AV_RN32(&src4[i*src_stride4+4]);\
 946         l0=  (a&0x03030303UL)\
 947            + (b&0x03030303UL)\
 948            + 0x02020202UL;\
 949         h0= ((a&0xFCFCFCFCUL)>>2)\
 950           + ((b&0xFCFCFCFCUL)>>2);\
 951         l1=  (c&0x03030303UL)\
 952            + (d&0x03030303UL);\
 953         h1= ((c&0xFCFCFCFCUL)>>2)\
 954           + ((d&0xFCFCFCFCUL)>>2);\
 955         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 956     }\
 957 }\
 958 \
 959 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 960     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 961 }\
 962 \
 963 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 964     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 965 }\
 966 \
 967 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 968     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 969 }\
 970 \
 971 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 972     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 973 }\
 974 \
 975 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 976                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 977     int i;\
 978     for(i=0; i<h; i++){\
 979         uint32_t a, b, c, d, l0, l1, h0, h1;\
 980         a= AV_RN32(&src1[i*src_stride1]);\
 981         b= AV_RN32(&src2[i*src_stride2]);\
 982         c= AV_RN32(&src3[i*src_stride3]);\
 983         d= AV_RN32(&src4[i*src_stride4]);\
 984         l0=  (a&0x03030303UL)\
 985            + (b&0x03030303UL)\
 986            + 0x01010101UL;\
 987         h0= ((a&0xFCFCFCFCUL)>>2)\
 988           + ((b&0xFCFCFCFCUL)>>2);\
 989         l1=  (c&0x03030303UL)\
 990            + (d&0x03030303UL);\
 991         h1= ((c&0xFCFCFCFCUL)>>2)\
 992           + ((d&0xFCFCFCFCUL)>>2);\
 993         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 994         a= AV_RN32(&src1[i*src_stride1+4]);\
 995         b= AV_RN32(&src2[i*src_stride2+4]);\
 996         c= AV_RN32(&src3[i*src_stride3+4]);\
 997         d= AV_RN32(&src4[i*src_stride4+4]);\
 998         l0=  (a&0x03030303UL)\
 999            + (b&0x03030303UL)\
1000            + 0x01010101UL;\
1001         h0= ((a&0xFCFCFCFCUL)>>2)\
1002           + ((b&0xFCFCFCFCUL)>>2);\
1003         l1=  (c&0x03030303UL)\
1004            + (d&0x03030303UL);\
1005         h1= ((c&0xFCFCFCFCUL)>>2)\
1006           + ((d&0xFCFCFCFCUL)>>2);\
1007         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008     }\
1009 }\
1010 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1011                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1012     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1013     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1014 }\
1015 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1016                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1017     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1018     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1019 }\
1020 \
1021 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1022 {\
1023         int i, a0, b0, a1, b1;\
1024         a0= pixels[0];\
1025         b0= pixels[1] + 2;\
1026         a0 += b0;\
1027         b0 += pixels[2];\
1028 \
1029         pixels+=line_size;\
1030         for(i=0; i<h; i+=2){\
1031             a1= pixels[0];\
1032             b1= pixels[1];\
1033             a1 += b1;\
1034             b1 += pixels[2];\
1035 \
1036             block[0]= (a1+a0)>>2; /* FIXME non put */\
1037             block[1]= (b1+b0)>>2;\
1038 \
1039             pixels+=line_size;\
1040             block +=line_size;\
1041 \
1042             a0= pixels[0];\
1043             b0= pixels[1] + 2;\
1044             a0 += b0;\
1045             b0 += pixels[2];\
1046 \
1047             block[0]= (a1+a0)>>2;\
1048             block[1]= (b1+b0)>>2;\
1049             pixels+=line_size;\
1050             block +=line_size;\
1051         }\
1052 }\
1053 \
1054 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1055 {\
1056         int i;\
1057         const uint32_t a= AV_RN32(pixels  );\
1058         const uint32_t b= AV_RN32(pixels+1);\
1059         uint32_t l0=  (a&0x03030303UL)\
1060                     + (b&0x03030303UL)\
1061                     + 0x02020202UL;\
1062         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1063                    + ((b&0xFCFCFCFCUL)>>2);\
1064         uint32_t l1,h1;\
1065 \
1066         pixels+=line_size;\
1067         for(i=0; i<h; i+=2){\
1068             uint32_t a= AV_RN32(pixels  );\
1069             uint32_t b= AV_RN32(pixels+1);\
1070             l1=  (a&0x03030303UL)\
1071                + (b&0x03030303UL);\
1072             h1= ((a&0xFCFCFCFCUL)>>2)\
1073               + ((b&0xFCFCFCFCUL)>>2);\
1074             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1075             pixels+=line_size;\
1076             block +=line_size;\
1077             a= AV_RN32(pixels  );\
1078             b= AV_RN32(pixels+1);\
1079             l0=  (a&0x03030303UL)\
1080                + (b&0x03030303UL)\
1081                + 0x02020202UL;\
1082             h0= ((a&0xFCFCFCFCUL)>>2)\
1083               + ((b&0xFCFCFCFCUL)>>2);\
1084             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1085             pixels+=line_size;\
1086             block +=line_size;\
1087         }\
1088 }\
1089 \
1090 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1091 {\
1092     int j;\
1093     for(j=0; j<2; j++){\
1094         int i;\
1095         const uint32_t a= AV_RN32(pixels  );\
1096         const uint32_t b= AV_RN32(pixels+1);\
1097         uint32_t l0=  (a&0x03030303UL)\
1098                     + (b&0x03030303UL)\
1099                     + 0x02020202UL;\
1100         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1101                    + ((b&0xFCFCFCFCUL)>>2);\
1102         uint32_t l1,h1;\
1103 \
1104         pixels+=line_size;\
1105         for(i=0; i<h; i+=2){\
1106             uint32_t a= AV_RN32(pixels  );\
1107             uint32_t b= AV_RN32(pixels+1);\
1108             l1=  (a&0x03030303UL)\
1109                + (b&0x03030303UL);\
1110             h1= ((a&0xFCFCFCFCUL)>>2)\
1111               + ((b&0xFCFCFCFCUL)>>2);\
1112             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1113             pixels+=line_size;\
1114             block +=line_size;\
1115             a= AV_RN32(pixels  );\
1116             b= AV_RN32(pixels+1);\
1117             l0=  (a&0x03030303UL)\
1118                + (b&0x03030303UL)\
1119                + 0x02020202UL;\
1120             h0= ((a&0xFCFCFCFCUL)>>2)\
1121               + ((b&0xFCFCFCFCUL)>>2);\
1122             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1123             pixels+=line_size;\
1124             block +=line_size;\
1125         }\
1126         pixels+=4-line_size*(h+1);\
1127         block +=4-line_size*h;\
1128     }\
1129 }\
1130 \
1131 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1132 {\
1133     int j;\
1134     for(j=0; j<2; j++){\
1135         int i;\
1136         const uint32_t a= AV_RN32(pixels  );\
1137         const uint32_t b= AV_RN32(pixels+1);\
1138         uint32_t l0=  (a&0x03030303UL)\
1139                     + (b&0x03030303UL)\
1140                     + 0x01010101UL;\
1141         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1142                    + ((b&0xFCFCFCFCUL)>>2);\
1143         uint32_t l1,h1;\
1144 \
1145         pixels+=line_size;\
1146         for(i=0; i<h; i+=2){\
1147             uint32_t a= AV_RN32(pixels  );\
1148             uint32_t b= AV_RN32(pixels+1);\
1149             l1=  (a&0x03030303UL)\
1150                + (b&0x03030303UL);\
1151             h1= ((a&0xFCFCFCFCUL)>>2)\
1152               + ((b&0xFCFCFCFCUL)>>2);\
1153             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1154             pixels+=line_size;\
1155             block +=line_size;\
1156             a= AV_RN32(pixels  );\
1157             b= AV_RN32(pixels+1);\
1158             l0=  (a&0x03030303UL)\
1159                + (b&0x03030303UL)\
1160                + 0x01010101UL;\
1161             h0= ((a&0xFCFCFCFCUL)>>2)\
1162               + ((b&0xFCFCFCFCUL)>>2);\
1163             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1164             pixels+=line_size;\
1165             block +=line_size;\
1166         }\
1167         pixels+=4-line_size*(h+1);\
1168         block +=4-line_size*h;\
1169     }\
1170 }\
1171 \
1172 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1173 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1174 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1175 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1176 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1177 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1178 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1179 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1180
1181 #define op_avg(a, b) a = rnd_avg32(a, b)
1182 #endif
 /* Store operator handed to PIXOP2 as OP: plain assignment of b to a. */
 1183 #define op_put(a, b) a = b
 1184
 /* Expand PIXOP2 twice: the first expansion pastes "avg" in front of every
  * generated function name and stores through op_avg, the second pastes
  * "put" and stores through op_put. */
 1185 PIXOP2(avg, op_avg)
 1186 PIXOP2(put, op_put)
 /* The two operator macros are removed again right after the expansions. */
 1187 #undef op_avg
 1188 #undef op_put
1189
1190 #define put_no_rnd_pixels8_c  put_pixels8_c
1191 #define put_no_rnd_pixels16_c put_pixels16_c
1192
1193 #define avg2(a,b) ((a+b+1)>>1)
1194 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1195
1196 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1197     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1198 }
1199
1200 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1201     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1202 }
1203
1204 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1205 {
1206     const int A=(16-x16)*(16-y16);
1207     const int B=(   x16)*(16-y16);
1208     const int C=(16-x16)*(   y16);
1209     const int D=(   x16)*(   y16);
1210     int i;
1211
1212     for(i=0; i<h; i++)
1213     {
1214         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1215         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1216         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1217         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1218         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1219         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1220         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1221         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1222         dst+= stride;
1223         src+= stride;
1224     }
1225 }
1226
1227 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1228                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1229 {
1230     int y, vx, vy;
1231     const int s= 1<<shift;
1232
1233     width--;
1234     height--;
1235
1236     for(y=0; y<h; y++){
1237         int x;
1238
1239         vx= ox;
1240         vy= oy;
1241         for(x=0; x<8; x++){ //XXX FIXME optimize
1242             int src_x, src_y, frac_x, frac_y, index;
1243
1244             src_x= vx>>16;
1245             src_y= vy>>16;
1246             frac_x= src_x&(s-1);
1247             frac_y= src_y&(s-1);
1248             src_x>>=shift;
1249             src_y>>=shift;
1250
1251             if((unsigned)src_x < width){
1252                 if((unsigned)src_y < height){
1253                     index= src_x + src_y*stride;
1254                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1255                                            + src[index       +1]*   frac_x )*(s-frac_y)
1256                                         + (  src[index+stride  ]*(s-frac_x)
1257                                            + src[index+stride+1]*   frac_x )*   frac_y
1258                                         + r)>>(shift*2);
1259                 }else{
1260                     index= src_x + av_clip(src_y, 0, height)*stride;
1261                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1262                                           + src[index       +1]*   frac_x )*s
1263                                         + r)>>(shift*2);
1264                 }
1265             }else{
1266                 if((unsigned)src_y < height){
1267                     index= av_clip(src_x, 0, width) + src_y*stride;
1268                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1269                                            + src[index+stride  ]*   frac_y )*s
1270                                         + r)>>(shift*2);
1271                 }else{
1272                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1273                     dst[y*stride + x]=    src[index         ];
1274                 }
1275             }
1276
1277             vx+= dxx;
1278             vy+= dyx;
1279         }
1280         ox += dxy;
1281         oy += dyy;
1282     }
1283 }
1284
1285 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1286     switch(width){
1287     case 2: put_pixels2_c (dst, src, stride, height); break;
1288     case 4: put_pixels4_c (dst, src, stride, height); break;
1289     case 8: put_pixels8_c (dst, src, stride, height); break;
1290     case 16:put_pixels16_c(dst, src, stride, height); break;
1291     }
1292 }
1293
1294 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1295     int i,j;
1296     for (i=0; i < height; i++) {
1297       for (j=0; j < width; j++) {
1298         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1299       }
1300       src += stride;
1301       dst += stride;
1302     }
1303 }
1304
1305 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1306     int i,j;
1307     for (i=0; i < height; i++) {
1308       for (j=0; j < width; j++) {
1309         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1310       }
1311       src += stride;
1312       dst += stride;
1313     }
1314 }
1315
1316 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1317     int i,j;
1318     for (i=0; i < height; i++) {
1319       for (j=0; j < width; j++) {
1320         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1321       }
1322       src += stride;
1323       dst += stride;
1324     }
1325 }
1326
1327 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1328     int i,j;
1329     for (i=0; i < height; i++) {
1330       for (j=0; j < width; j++) {
1331         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1332       }
1333       src += stride;
1334       dst += stride;
1335     }
1336 }
1337
1338 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1339     int i,j;
1340     for (i=0; i < height; i++) {
1341       for (j=0; j < width; j++) {
1342         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1343       }
1344       src += stride;
1345       dst += stride;
1346     }
1347 }
1348
1349 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1350     int i,j;
1351     for (i=0; i < height; i++) {
1352       for (j=0; j < width; j++) {
1353         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1354       }
1355       src += stride;
1356       dst += stride;
1357     }
1358 }
1359
1360 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1361     int i,j;
1362     for (i=0; i < height; i++) {
1363       for (j=0; j < width; j++) {
1364         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1365       }
1366       src += stride;
1367       dst += stride;
1368     }
1369 }
1370
1371 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1372     int i,j;
1373     for (i=0; i < height; i++) {
1374       for (j=0; j < width; j++) {
1375         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1376       }
1377       src += stride;
1378       dst += stride;
1379     }
1380 }
1381
1382 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1383     switch(width){
1384     case 2: avg_pixels2_c (dst, src, stride, height); break;
1385     case 4: avg_pixels4_c (dst, src, stride, height); break;
1386     case 8: avg_pixels8_c (dst, src, stride, height); break;
1387     case 16:avg_pixels16_c(dst, src, stride, height); break;
1388     }
1389 }
1390
1391 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1392     int i,j;
1393     for (i=0; i < height; i++) {
1394       for (j=0; j < width; j++) {
1395         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1396       }
1397       src += stride;
1398       dst += stride;
1399     }
1400 }
1401
1402 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1403     int i,j;
1404     for (i=0; i < height; i++) {
1405       for (j=0; j < width; j++) {
1406         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1407       }
1408       src += stride;
1409       dst += stride;
1410     }
1411 }
1412
1413 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1414     int i,j;
1415     for (i=0; i < height; i++) {
1416       for (j=0; j < width; j++) {
1417         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1418       }
1419       src += stride;
1420       dst += stride;
1421     }
1422 }
1423
1424 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1425     int i,j;
1426     for (i=0; i < height; i++) {
1427       for (j=0; j < width; j++) {
1428         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1429       }
1430       src += stride;
1431       dst += stride;
1432     }
1433 }
1434
1435 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1436     int i,j;
1437     for (i=0; i < height; i++) {
1438       for (j=0; j < width; j++) {
1439         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1440       }
1441       src += stride;
1442       dst += stride;
1443     }
1444 }
1445
1446 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1447     int i,j;
1448     for (i=0; i < height; i++) {
1449       for (j=0; j < width; j++) {
1450         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1451       }
1452       src += stride;
1453       dst += stride;
1454     }
1455 }
1456
1457 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1458     int i,j;
1459     for (i=0; i < height; i++) {
1460       for (j=0; j < width; j++) {
1461         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1462       }
1463       src += stride;
1464       dst += stride;
1465     }
1466 }
1467
1468 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1469     int i,j;
1470     for (i=0; i < height; i++) {
1471       for (j=0; j < width; j++) {
1472         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1473       }
1474       src += stride;
1475       dst += stride;
1476     }
1477 }
1478 #if 0
1479 #define TPEL_WIDTH(width)\
1480 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1481     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1482 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1483     void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1484 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1485     void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1486 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1487     void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1488 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1489     void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1490 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1491     void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1492 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1493     void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1494 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1495     void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1496 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1497     void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1498 #endif
1499
1500 #define H264_CHROMA_MC(OPNAME, OP)\
1501 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1502     const int A=(8-x)*(8-y);\
1503     const int B=(  x)*(8-y);\
1504     const int C=(8-x)*(  y);\
1505     const int D=(  x)*(  y);\
1506     int i;\
1507     \
1508     assert(x<8 && y<8 && x>=0 && y>=0);\
1509 \
1510     if(D){\
1511         for(i=0; i<h; i++){\
1512             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1513             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1514             dst+= stride;\
1515             src+= stride;\
1516         }\
1517     }else{\
1518         const int E= B+C;\
1519         const int step= C ? stride : 1;\
1520         for(i=0; i<h; i++){\
1521             OP(dst[0], (A*src[0] + E*src[step+0]));\
1522             OP(dst[1], (A*src[1] + E*src[step+1]));\
1523             dst+= stride;\
1524             src+= stride;\
1525         }\
1526     }\
1527 }\
1528 \
1529 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1530     const int A=(8-x)*(8-y);\
1531     const int B=(  x)*(8-y);\
1532     const int C=(8-x)*(  y);\
1533     const int D=(  x)*(  y);\
1534     int i;\
1535     \
1536     assert(x<8 && y<8 && x>=0 && y>=0);\
1537 \
1538     if(D){\
1539         for(i=0; i<h; i++){\
1540             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1541             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1542             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1543             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1544             dst+= stride;\
1545             src+= stride;\
1546         }\
1547     }else{\
1548         const int E= B+C;\
1549         const int step= C ? stride : 1;\
1550         for(i=0; i<h; i++){\
1551             OP(dst[0], (A*src[0] + E*src[step+0]));\
1552             OP(dst[1], (A*src[1] + E*src[step+1]));\
1553             OP(dst[2], (A*src[2] + E*src[step+2]));\
1554             OP(dst[3], (A*src[3] + E*src[step+3]));\
1555             dst+= stride;\
1556             src+= stride;\
1557         }\
1558     }\
1559 }\
1560 \
1561 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1562     const int A=(8-x)*(8-y);\
1563     const int B=(  x)*(8-y);\
1564     const int C=(8-x)*(  y);\
1565     const int D=(  x)*(  y);\
1566     int i;\
1567     \
1568     assert(x<8 && y<8 && x>=0 && y>=0);\
1569 \
1570     if(D){\
1571         for(i=0; i<h; i++){\
1572             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1573             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1574             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1575             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1576             OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1577             OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1578             OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1579             OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1580             dst+= stride;\
1581             src+= stride;\
1582         }\
1583     }else{\
1584         const int E= B+C;\
1585         const int step= C ? stride : 1;\
1586         for(i=0; i<h; i++){\
1587             OP(dst[0], (A*src[0] + E*src[step+0]));\
1588             OP(dst[1], (A*src[1] + E*src[step+1]));\
1589             OP(dst[2], (A*src[2] + E*src[step+2]));\
1590             OP(dst[3], (A*src[3] + E*src[step+3]));\
1591             OP(dst[4], (A*src[4] + E*src[step+4]));\
1592             OP(dst[5], (A*src[5] + E*src[step+5]));\
1593             OP(dst[6], (A*src[6] + E*src[step+6]));\
1594             OP(dst[7], (A*src[7] + E*src[step+7]));\
1595             dst+= stride;\
1596             src+= stride;\
1597         }\
1598     }\
1599 }
1600
 /* Store operators for H264_CHROMA_MC. b is the unscaled weighted sum (the
  * weights A+B+C+D in that macro add up to 64), so (b + 32) >> 6 scales it
  * back with rounding. op_put stores that value; op_avg stores the rounded
  * average of that value and what a already holds. */
 1601 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
 1602 #define op_put(a, b) a = (((b) + 32)>>6)
 1603
 /* Emit the put_h264_chroma_mc{2,4,8}_c and avg_h264_chroma_mc{2,4,8}_c
  * functions. */
 1604 H264_CHROMA_MC(put_       , op_put)
 1605 H264_CHROMA_MC(avg_       , op_avg)
 /* The operator macros are removed again right after the expansions. */
 1606 #undef op_avg
 1607 #undef op_put
1608
1609 #define QPEL_MC(r, OPNAME, RND, OP) \
1610 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1611     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1612     int i;\
1613     for(i=0; i<h; i++)\
1614     {\
1615         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1616         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1617         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1618         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1619         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1620         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1621         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1622         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1623         dst+=dstStride;\
1624         src+=srcStride;\
1625     }\
1626 }\
1627 \
1628 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Vertical 8-tap lowpass for an 8-column block. For each column, 9 \
         * samples spaced srcStride apart are loaded into locals, then 8 outputs \
         * spaced dstStride apart are produced with tap weights \
         * (-1, 3, -6, 20, 20, -6, 3, -1), using mirrored samples where a tap \
         * would fall outside rows 0..8. OP receives the raw sum and stores it. \
         * cm is declared for use by OP. \
         */\
1629     const int w=8;\
1630     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1631     int i;\
1632     for(i=0; i<w; i++)\
1633     {\
1634         const int src0= src[0*srcStride];\
1635         const int src1= src[1*srcStride];\
1636         const int src2= src[2*srcStride];\
1637         const int src3= src[3*srcStride];\
1638         const int src4= src[4*srcStride];\
1639         const int src5= src[5*srcStride];\
1640         const int src6= src[6*srcStride];\
1641         const int src7= src[7*srcStride];\
1642         const int src8= src[8*srcStride];\
1643         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1644         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1645         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1646         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1647         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1648         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1649         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1650         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1651         dst++;\
1652         src++;\
1653     }\
1654 }\
1655 \
1656 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /*\
         * Horizontal 8-tap lowpass over h rows of a 16-wide block. Each row \
         * turns the 17 samples src[0..16] into the 16 outputs dst[0..15] with \
         * tap weights (-1, 3, -6, 20, 20, -6, 3, -1). Taps that would fall \
         * outside src[0..16] are replaced by mirrored in-range samples. OP \
         * receives the raw sum and stores it; cm is declared for use by OP. \
         */\
1657     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1658     int i;\
1659     \
1660     for(i=0; i<h; i++)\
1661     {\
1662         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1663         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1664         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1665         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1666         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1667         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1668         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1669         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1670         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1671         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1672         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1673         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1674         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1675         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1676         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1677         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1678         dst+=dstStride;\
1679         src+=srcStride;\
1680     }\
1681 }\
1682 \
1683 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Vertical 8-tap lowpass for a 16-column block. For each column, 17 \
         * samples spaced srcStride apart are loaded into locals, then 16 \
         * outputs spaced dstStride apart are produced with tap weights \
         * (-1, 3, -6, 20, 20, -6, 3, -1), using mirrored samples where a tap \
         * would fall outside rows 0..16. OP receives the raw sum and stores \
         * it; cm is declared for use by OP. \
         */\
1684     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1685     int i;\
1686     const int w=16;\
1687     for(i=0; i<w; i++)\
1688     {\
1689         const int src0= src[0*srcStride];\
1690         const int src1= src[1*srcStride];\
1691         const int src2= src[2*srcStride];\
1692         const int src3= src[3*srcStride];\
1693         const int src4= src[4*srcStride];\
1694         const int src5= src[5*srcStride];\
1695         const int src6= src[6*srcStride];\
1696         const int src7= src[7*srcStride];\
1697         const int src8= src[8*srcStride];\
1698         const int src9= src[9*srcStride];\
1699         const int src10= src[10*srcStride];\
1700         const int src11= src[11*srcStride];\
1701         const int src12= src[12*srcStride];\
1702         const int src13= src[13*srcStride];\
1703         const int src14= src[14*srcStride];\
1704         const int src15= src[15*srcStride];\
1705         const int src16= src[16*srcStride];\
1706         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1707         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1708         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1709         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1710         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1711         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1712         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1713         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1714         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1715         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1716         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1717         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1718         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1719         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1720         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1721         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1722         dst++;\
1723         src++;\
1724     }\
1725 }\
1726 \
1727 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 8 rows of src into the 8x8 scratch \
         * buffer half (put ## RND variant, row stride 8), then passes dst, src \
         * and half to OPNAME ## pixels8_l2 (defined outside this chunk). \
         */\
1728     uint8_t half[64];\
1729     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1730     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1731 }\
1732 \
1733 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Applies the OPNAME horizontal lowpass directly from src to dst for \
         * 8 rows; both buffers use the same stride. \
         */\
1734     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1735 }\
1736 \
1737 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 8 rows of src into the 8x8 scratch \
         * buffer half, then passes dst, src+1 (one pixel to the right) and \
         * half to OPNAME ## pixels8_l2 (defined outside this chunk). \
         */\
1738     uint8_t half[64];\
1739     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1740     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1741 }\
1742 \
1743 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Calls copy_block9 to fill the scratch buffer full (row stride 16, \
         * 9 rows requested), vertically lowpass-filters full into the 8x8 \
         * buffer half, then passes dst, full and half to \
         * OPNAME ## pixels8_l2. copy_block9 and pixels8_l2 are defined outside \
         * this chunk. \
         */\
1744     uint8_t full[16*9];\
1745     uint8_t half[64];\
1746     copy_block9(full, src, 16, stride, 9);\
1747     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1748     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1749 }\
1750 \
1751 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Calls copy_block9 to fill the scratch buffer full (row stride 16, \
         * 9 rows requested), then applies the OPNAME vertical lowpass from \
         * full straight into dst. \
         */\
1752     uint8_t full[16*9];\
1753     copy_block9(full, src, 16, stride, 9);\
1754     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1755 }\
1756 \
1757 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Calls copy_block9 to fill full (row stride 16, 9 rows requested), \
         * vertically lowpass-filters full into half, then passes dst, full+16 \
         * (one row down in the stride-16 buffer) and half to \
         * OPNAME ## pixels8_l2. \
         */\
1758     uint8_t full[16*9];\
1759     uint8_t half[64];\
1760     copy_block9(full, src, 16, stride, 9);\
1761     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1762     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1763 }\
1764 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds three intermediates from \
         * the copied block full: halfH (horizontal lowpass, 9 rows), halfV \
         * (vertical lowpass of full) and halfHV (vertical lowpass of halfH). \
         * Then passes dst, full, halfH, halfV and halfHV to \
         * OPNAME ## pixels8_l4 (defined outside this chunk). \
         */\
1765     uint8_t full[16*9];\
1766     uint8_t halfH[72];\
1767     uint8_t halfV[64];\
1768     uint8_t halfHV[64];\
1769     copy_block9(full, src, 16, stride, 9);\
1770     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1771     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1772     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1774 }\
1775 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows of the copied block full into \
         * halfH, then overwrites halfH in place via put ## RND ## pixels8_l2 \
         * with halfH and full as inputs. halfH is then filtered vertically \
         * into halfHV, and dst, halfH and halfHV go to OPNAME ## pixels8_l2. \
         * The in-place step must precede the vertical filter. \
         */\
1776     uint8_t full[16*9];\
1777     uint8_t halfH[72];\
1778     uint8_t halfHV[64];\
1779     copy_block9(full, src, 16, stride, 9);\
1780     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1781     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1782     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1783     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1784 }\
1785 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 9 rows), halfV (vertical lowpass of full+1, i.e. one pixel \
         * to the right) and halfHV (vertical lowpass of halfH), then passes \
         * dst, full+1, halfH, halfV and halfHV to OPNAME ## pixels8_l4. \
         */\
1786     uint8_t full[16*9];\
1787     uint8_t halfH[72];\
1788     uint8_t halfV[64];\
1789     uint8_t halfHV[64];\
1790     copy_block9(full, src, 16, stride, 9);\
1791     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1792     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1793     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1794     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1795 }\
1796 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels8_l2 with halfH and full+1 \
         * (one pixel to the right) as inputs, filters halfH vertically into \
         * halfHV, then passes dst, halfH and halfHV to OPNAME ## pixels8_l2. \
         */\
1797     uint8_t full[16*9];\
1798     uint8_t halfH[72];\
1799     uint8_t halfHV[64];\
1800     copy_block9(full, src, 16, stride, 9);\
1801     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1802     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1803     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1805 }\
1806 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 9 rows), halfV (vertical lowpass of full) and halfHV \
         * (vertical lowpass of halfH), then passes dst, full+16 and halfH+8 \
         * (each one row down in its buffer), halfV and halfHV to \
         * OPNAME ## pixels8_l4. \
         */\
1807     uint8_t full[16*9];\
1808     uint8_t halfH[72];\
1809     uint8_t halfV[64];\
1810     uint8_t halfHV[64];\
1811     copy_block9(full, src, 16, stride, 9);\
1812     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1813     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1814     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1816 }\
1817 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels8_l2 with halfH and full as \
         * inputs, filters halfH vertically into halfHV, then passes dst, \
         * halfH+8 (one row down in the stride-8 buffer) and halfHV to \
         * OPNAME ## pixels8_l2. \
         */\
1818     uint8_t full[16*9];\
1819     uint8_t halfH[72];\
1820     uint8_t halfHV[64];\
1821     copy_block9(full, src, 16, stride, 9);\
1822     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1823     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1824     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1825     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1826 }\
1827 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 9 rows), halfV (vertical lowpass of full+1) and halfHV \
         * (vertical lowpass of halfH), then passes dst, full+17 (one row down \
         * and one pixel right), halfH+8 (one row down), halfV and halfHV to \
         * OPNAME ## pixels8_l4. \
         */\
1828     uint8_t full[16*9];\
1829     uint8_t halfH[72];\
1830     uint8_t halfV[64];\
1831     uint8_t halfHV[64];\
1832     copy_block9(full, src, 16, stride, 9);\
1833     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1834     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1835     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1836     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1837 }\
1838 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels8_l2 with halfH and full+1 \
         * as inputs, filters halfH vertically into halfHV, then passes dst, \
         * halfH+8 (one row down) and halfHV to OPNAME ## pixels8_l2. \
         */\
1839     uint8_t full[16*9];\
1840     uint8_t halfH[72];\
1841     uint8_t halfHV[64];\
1842     copy_block9(full, src, 16, stride, 9);\
1843     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1844     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1845     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1846     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1847 }\
1848 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows straight from src into halfH \
         * (no copy into a scratch block), filters halfH vertically into \
         * halfHV, then passes dst, halfH and halfHV to OPNAME ## pixels8_l2. \
         */\
1849     uint8_t halfH[72];\
1850     uint8_t halfHV[64];\
1851     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1852     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1853     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1854 }\
1855 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows straight from src into halfH, \
         * filters halfH vertically into halfHV, then passes dst, halfH+8 (one \
         * row down in the stride-8 buffer) and halfHV to \
         * OPNAME ## pixels8_l2. \
         */\
1856     uint8_t halfH[72];\
1857     uint8_t halfHV[64];\
1858     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1859     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1860     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1861 }\
1862 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 9 rows), halfV (vertical lowpass of full) and halfHV \
         * (vertical lowpass of halfH), then passes dst, halfV and halfHV to \
         * OPNAME ## pixels8_l2. halfH is only used as the input of halfHV. \
         */\
1863     uint8_t full[16*9];\
1864     uint8_t halfH[72];\
1865     uint8_t halfV[64];\
1866     uint8_t halfHV[64];\
1867     copy_block9(full, src, 16, stride, 9);\
1868     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1869     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1870     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1871     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1872 }\
1873 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels8_l2 with halfH and full as \
         * inputs, then applies the OPNAME vertical lowpass from halfH \
         * straight into dst. \
         */\
1874     uint8_t full[16*9];\
1875     uint8_t halfH[72];\
1876     copy_block9(full, src, 16, stride, 9);\
1877     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1878     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1879     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1880 }\
1881 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 9 rows), halfV (vertical lowpass of full+1, one pixel to \
         * the right) and halfHV (vertical lowpass of halfH), then passes dst, \
         * halfV and halfHV to OPNAME ## pixels8_l2. \
         */\
1882     uint8_t full[16*9];\
1883     uint8_t halfH[72];\
1884     uint8_t halfV[64];\
1885     uint8_t halfHV[64];\
1886     copy_block9(full, src, 16, stride, 9);\
1887     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1888     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1889     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1891 }\
1892 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 9 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels8_l2 with halfH and full+1 \
         * as inputs, then applies the OPNAME vertical lowpass from halfH \
         * straight into dst. \
         */\
1893     uint8_t full[16*9];\
1894     uint8_t halfH[72];\
1895     copy_block9(full, src, 16, stride, 9);\
1896     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1897     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1898     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1899 }\
1900 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Two-pass filter: 9 rows of src are lowpass-filtered horizontally \
         * into halfH (row stride 8), then the OPNAME vertical lowpass runs \
         * from halfH straight into dst. \
         */\
1901     uint8_t halfH[72];\
1902     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1903     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1904 }\
1905 \
1906 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 16 rows of src into the 16x16 scratch \
         * buffer half (put ## RND variant, row stride 16), then passes dst, \
         * src and half to OPNAME ## pixels16_l2 (defined outside this chunk). \
         */\
1907     uint8_t half[256];\
1908     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1909     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1910 }\
1911 \
1912 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Applies the OPNAME 16-wide horizontal lowpass directly from src to \
         * dst for 16 rows; both buffers use the same stride. \
         */\
1913     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1914 }\
1915 \
1916 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 16 rows of src into the 16x16 scratch \
         * buffer half, then passes dst, src+1 (one pixel to the right) and \
         * half to OPNAME ## pixels16_l2 (defined outside this chunk). \
         */\
1917     uint8_t half[256];\
1918     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1919     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1920 }\
1921 \
1922 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Calls copy_block17 to fill the scratch buffer full (row stride 24, \
         * 17 rows requested), vertically lowpass-filters full into the 16x16 \
         * buffer half, then passes dst, full and half to \
         * OPNAME ## pixels16_l2. copy_block17 and pixels16_l2 are defined \
         * outside this chunk. \
         */\
1923     uint8_t full[24*17];\
1924     uint8_t half[256];\
1925     copy_block17(full, src, 24, stride, 17);\
1926     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1927     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1928 }\
1929 \
1930 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Calls copy_block17 to fill the scratch buffer full (row stride 24, \
         * 17 rows requested), then applies the OPNAME vertical lowpass from \
         * full straight into dst. \
         */\
1931     uint8_t full[24*17];\
1932     copy_block17(full, src, 24, stride, 17);\
1933     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1934 }\
1935 \
1936 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Calls copy_block17 to fill full (row stride 24, 17 rows requested), \
         * vertically lowpass-filters full into half, then passes dst, full+24 \
         * (one row down in the stride-24 buffer) and half to \
         * OPNAME ## pixels16_l2. \
         */\
1937     uint8_t full[24*17];\
1938     uint8_t half[256];\
1939     copy_block17(full, src, 24, stride, 17);\
1940     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1941     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1942 }\
1943 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds three intermediates from \
         * the copied block full: halfH (horizontal lowpass, 17 rows), halfV \
         * (vertical lowpass of full) and halfHV (vertical lowpass of halfH). \
         * Then passes dst, full, halfH, halfV and halfHV to \
         * OPNAME ## pixels16_l4 (defined outside this chunk). \
         */\
1944     uint8_t full[24*17];\
1945     uint8_t halfH[272];\
1946     uint8_t halfV[256];\
1947     uint8_t halfHV[256];\
1948     copy_block17(full, src, 24, stride, 17);\
1949     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1950     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1951     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1952     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1953 }\
1954 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows of the copied block full into \
         * halfH, then overwrites halfH in place via put ## RND ## pixels16_l2 \
         * with halfH and full as inputs. halfH is then filtered vertically \
         * into halfHV, and dst, halfH and halfHV go to OPNAME ## pixels16_l2. \
         * The in-place step must precede the vertical filter. \
         */\
1955     uint8_t full[24*17];\
1956     uint8_t halfH[272];\
1957     uint8_t halfHV[256];\
1958     copy_block17(full, src, 24, stride, 17);\
1959     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1960     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1961     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1963 }\
1964 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 17 rows), halfV (vertical lowpass of full+1, i.e. one pixel \
         * to the right) and halfHV (vertical lowpass of halfH), then passes \
         * dst, full+1, halfH, halfV and halfHV to OPNAME ## pixels16_l4. \
         */\
1965     uint8_t full[24*17];\
1966     uint8_t halfH[272];\
1967     uint8_t halfV[256];\
1968     uint8_t halfHV[256];\
1969     copy_block17(full, src, 24, stride, 17);\
1970     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1972     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1974 }\
1975 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels16_l2 with halfH and full+1 \
         * (one pixel to the right) as inputs, filters halfH vertically into \
         * halfHV, then passes dst, halfH and halfHV to OPNAME ## pixels16_l2. \
         */\
1976     uint8_t full[24*17];\
1977     uint8_t halfH[272];\
1978     uint8_t halfHV[256];\
1979     copy_block17(full, src, 24, stride, 17);\
1980     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1981     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1982     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1983     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1984 }\
1985 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 17 rows), halfV (vertical lowpass of full) and halfHV \
         * (vertical lowpass of halfH), then passes dst, full+24 and halfH+16 \
         * (each one row down in its buffer), halfV and halfHV to \
         * OPNAME ## pixels16_l4. \
         */\
1986     uint8_t full[24*17];\
1987     uint8_t halfH[272];\
1988     uint8_t halfV[256];\
1989     uint8_t halfHV[256];\
1990     copy_block17(full, src, 24, stride, 17);\
1991     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1992     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1993     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1995 }\
1996 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels16_l2 with halfH and full as \
         * inputs, filters halfH vertically into halfHV, then passes dst, \
         * halfH+16 (one row down in the stride-16 buffer) and halfHV to \
         * OPNAME ## pixels16_l2. \
         */\
1997     uint8_t full[24*17];\
1998     uint8_t halfH[272];\
1999     uint8_t halfHV[256];\
2000     copy_block17(full, src, 24, stride, 17);\
2001     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2003     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2005 }\
2006 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 17 rows), halfV (vertical lowpass of full+1) and halfHV \
         * (vertical lowpass of halfH), then passes dst, full+25 (one row down \
         * and one pixel right), halfH+16 (one row down), halfV and halfHV to \
         * OPNAME ## pixels16_l4. \
         */\
2007     uint8_t full[24*17];\
2008     uint8_t halfH[272];\
2009     uint8_t halfV[256];\
2010     uint8_t halfHV[256];\
2011     copy_block17(full, src, 24, stride, 17);\
2012     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2013     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2014     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2016 }\
2017 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels16_l2 with halfH and full+1 \
         * as inputs, filters halfH vertically into halfHV, then passes dst, \
         * halfH+16 (one row down) and halfHV to OPNAME ## pixels16_l2. \
         */\
2018     uint8_t full[24*17];\
2019     uint8_t halfH[272];\
2020     uint8_t halfHV[256];\
2021     copy_block17(full, src, 24, stride, 17);\
2022     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2024     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2025     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2026 }\
2027 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows straight from src into halfH \
         * (no copy into a scratch block), filters halfH vertically into \
         * halfHV, then passes dst, halfH and halfHV to OPNAME ## pixels16_l2. \
         */\
2028     uint8_t halfH[272];\
2029     uint8_t halfHV[256];\
2030     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2031     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2032     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2033 }\
2034 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows straight from src into halfH, \
         * filters halfH vertically into halfHV, then passes dst, halfH+16 \
         * (one row down in the stride-16 buffer) and halfHV to \
         * OPNAME ## pixels16_l2. \
         */\
2035     uint8_t halfH[272];\
2036     uint8_t halfHV[256];\
2037     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2038     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2040 }\
2041 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 17 rows), halfV (vertical lowpass of full) and halfHV \
         * (vertical lowpass of halfH), then passes dst, halfV and halfHV to \
         * OPNAME ## pixels16_l2. halfH is only used as the input of halfHV. \
         */\
2042     uint8_t full[24*17];\
2043     uint8_t halfH[272];\
2044     uint8_t halfV[256];\
2045     uint8_t halfHV[256];\
2046     copy_block17(full, src, 24, stride, 17);\
2047     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2048     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2049     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2050     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2051 }\
2052 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels16_l2 with halfH and full as \
         * inputs, then applies the OPNAME vertical lowpass from halfH \
         * straight into dst. \
         */\
2053     uint8_t full[24*17];\
2054     uint8_t halfH[272];\
2055     copy_block17(full, src, 24, stride, 17);\
2056     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2057     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2058     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2059 }\
2060 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Non-static, ff_-prefixed variant. Builds halfH (horizontal lowpass \
         * of full, 17 rows), halfV (vertical lowpass of full+1, one pixel to \
         * the right) and halfHV (vertical lowpass of halfH), then passes dst, \
         * halfV and halfHV to OPNAME ## pixels16_l2. \
         */\
2061     uint8_t full[24*17];\
2062     uint8_t halfH[272];\
2063     uint8_t halfV[256];\
2064     uint8_t halfHV[256];\
2065     copy_block17(full, src, 24, stride, 17);\
2066     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2067     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2068     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2069     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2070 }\
2071 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Horizontally lowpass-filters 17 rows of full into halfH, overwrites \
         * halfH in place via put ## RND ## pixels16_l2 with halfH and full+1 \
         * as inputs, then applies the OPNAME vertical lowpass from halfH \
         * straight into dst. \
         */\
2072     uint8_t full[24*17];\
2073     uint8_t halfH[272];\
2074     copy_block17(full, src, 24, stride, 17);\
2075     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2076     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2077     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2078 }\
2079 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /*\
         * Two-pass filter: 17 rows of src are lowpass-filtered horizontally \
         * into halfH (row stride 16), then the OPNAME vertical lowpass runs \
         * from halfH straight into dst. This is the last function of the \
         * macro body, so its closing brace carries no line continuation. \
         */\
2080     uint8_t halfH[272];\
2081     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2082     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2083 }
2084
/* Store operators handed to QPEL_MC as its OP argument.
 * a is the destination lvalue and b the raw filter sum (the MPEG-4 qpel taps
 * -1, 3, -6, 20, 20, -6, 3, -1 add up to 32). b is biased by 16 (rounding
 * variants) or 15 (no_rnd variants), shifted right by 5 and looked up in cm,
 * which must be a variable in scope at the expansion site.
 * The put variants store the looked-up value; the avg variants average it
 * with the current value of a, adding 1 before the shift only in op_avg.
 * NOTE(review): cm presumably clamps to the 0..255 pixel range - the table
 * contents are not visible here, confirm against ff_cropTbl's initialisation. */
2085 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2086 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2087 #define op_put(a, b) a = cm[((b) + 16)>>5]
2088 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2089
/* Instantiate the QPEL_MC function family three times: put_, put_no_rnd_ and
 * avg_. The second and third arguments become the OPNAME and RND name
 * fragments and the fourth is the store operator. The avg_no_rnd
 * instantiation below is commented out, so op_avg_no_rnd is defined but not
 * passed to any instantiation. The op_* helpers are undefined afterwards. */
2090 QPEL_MC(0, put_       , _       , op_put)
2091 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2092 QPEL_MC(0, avg_       , _       , op_avg)
2093 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2094 #undef op_avg
2095 #undef op_avg_no_rnd
2096 #undef op_put
2097 #undef op_put_no_rnd
2098
/* QPEL_MC defines no mc00 functions, so the mc00 names are aliased to the
 * whole-block ff_{put,avg}_pixels helpers. The put_no_rnd aliases map to the
 * same ff_put_pixels functions as the rounding put aliases. */
2099 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2100 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2101 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2102 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2103 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2104 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2105
2106 #if 1
2107 #define H264_LOWPASS(OPNAME, OP, OP2) \
2108 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Horizontal 6-tap filter for a 2x2 block, tap weights \
         * (1, -5, 20, 20, -5, 1). Each of the 2 rows reads src[-2..4] and \
         * produces dst[0..1]; samples left of src are read directly, with no \
         * mirroring. OP receives the raw sum and stores it; cm is declared \
         * for use by OP. \
         */\
2109     const int h=2;\
2110     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2111     int i;\
2112     for(i=0; i<h; i++)\
2113     {\
2114         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2115         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2116         dst+=dstStride;\
2117         src+=srcStride;\
2118     }\
2119 }\
2120 \
2121 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Vertical 6-tap filter for a 2x2 block, tap weights \
         * (1, -5, 20, 20, -5, 1). For each of the 2 columns, the 7 samples at \
         * rows -2..4 (spaced srcStride apart) are loaded into locals and 2 \
         * outputs spaced dstStride apart are produced. OP receives the raw \
         * sum and stores it; cm is declared for use by OP. \
         */\
2122     const int w=2;\
2123     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2124     int i;\
2125     for(i=0; i<w; i++)\
2126     {\
2127         const int srcB= src[-2*srcStride];\
2128         const int srcA= src[-1*srcStride];\
2129         const int src0= src[0 *srcStride];\
2130         const int src1= src[1 *srcStride];\
2131         const int src2= src[2 *srcStride];\
2132         const int src3= src[3 *srcStride];\
2133         const int src4= src[4 *srcStride];\
2134         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2135         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2136         dst++;\
2137         src++;\
2138     }\
2139 }\
2140 \
2141 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /*\
         * Two-pass 6-tap filter for a 2x2 block. Pass 1 starts two rows above \
         * src and writes the raw horizontal sums of h+5 = 7 rows into the \
         * int16_t buffer tmp, without going through OP. tmp is then rewound by \
         * h+5-2 rows, so tmp[-2*tmpStride] is the first filtered row. Pass 2 \
         * filters tmp vertically with the same (1, -5, 20, 20, -5, 1) taps \
         * and hands each sum to OP2, which stores into dst. cm is declared \
         * for use by OP2. \
         */\
2142     const int h=2;\
2143     const int w=2;\
2144     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2145     int i;\
2146     src -= 2*srcStride;\
2147     for(i=0; i<h+5; i++)\
2148     {\
2149         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2150         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2151         tmp+=tmpStride;\
2152         src+=srcStride;\
2153     }\
2154     tmp -= tmpStride*(h+5-2);\
2155     for(i=0; i<w; i++)\
2156     {\
2157         const int tmpB= tmp[-2*tmpStride];\
2158         const int tmpA= tmp[-1*tmpStride];\
2159         const int tmp0= tmp[0 *tmpStride];\
2160         const int tmp1= tmp[1 *tmpStride];\
2161         const int tmp2= tmp[2 *tmpStride];\
2162         const int tmp3= tmp[3 *tmpStride];\
2163         const int tmp4= tmp[4 *tmpStride];\
2164         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2165         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2166         dst++;\
2167         tmp++;\
2168     }\
2169 }\
2170 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Horizontal 6-tap filter for a 4x4 block, tap weights \
         * (1, -5, 20, 20, -5, 1). Each of the 4 rows reads src[-2..6] and \
         * produces dst[0..3]. OP receives the raw sum and stores it; cm is \
         * declared for use by OP. \
         */\
2171     const int h=4;\
2172     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2173     int i;\
2174     for(i=0; i<h; i++)\
2175     {\
2176         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2177         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2178         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2179         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2180         dst+=dstStride;\
2181         src+=srcStride;\
2182     }\
2183 }\
2184 \
2185 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Vertical 6-tap filter for a 4x4 block, tap weights \
         * (1, -5, 20, 20, -5, 1). For each of the 4 columns, the 9 samples at \
         * rows -2..6 (spaced srcStride apart) are loaded into locals and 4 \
         * outputs spaced dstStride apart are produced. OP receives the raw \
         * sum and stores it; cm is declared for use by OP. \
         */\
2186     const int w=4;\
2187     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2188     int i;\
2189     for(i=0; i<w; i++)\
2190     {\
2191         const int srcB= src[-2*srcStride];\
2192         const int srcA= src[-1*srcStride];\
2193         const int src0= src[0 *srcStride];\
2194         const int src1= src[1 *srcStride];\
2195         const int src2= src[2 *srcStride];\
2196         const int src3= src[3 *srcStride];\
2197         const int src4= src[4 *srcStride];\
2198         const int src5= src[5 *srcStride];\
2199         const int src6= src[6 *srcStride];\
2200         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2201         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2202         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2203         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2204         dst++;\
2205         src++;\
2206     }\
2207 }\
2208 \
2209 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /*\
         * Two-pass 6-tap filter for a 4x4 block. Pass 1 starts two rows above \
         * src and writes the raw horizontal sums of h+5 = 9 rows into the \
         * int16_t buffer tmp, without going through OP. tmp is then rewound by \
         * h+5-2 rows, so tmp[-2*tmpStride] is the first filtered row. Pass 2 \
         * filters tmp vertically with the same (1, -5, 20, 20, -5, 1) taps \
         * and hands each sum to OP2, which stores into dst. cm is declared \
         * for use by OP2. \
         */\
2210     const int h=4;\
2211     const int w=4;\
2212     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2213     int i;\
2214     src -= 2*srcStride;\
2215     for(i=0; i<h+5; i++)\
2216     {\
2217         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2218         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2219         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2220         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2221         tmp+=tmpStride;\
2222         src+=srcStride;\
2223     }\
2224     tmp -= tmpStride*(h+5-2);\
2225     for(i=0; i<w; i++)\
2226     {\
2227         const int tmpB= tmp[-2*tmpStride];\
2228         const int tmpA= tmp[-1*tmpStride];\
2229         const int tmp0= tmp[0 *tmpStride];\
2230         const int tmp1= tmp[1 *tmpStride];\
2231         const int tmp2= tmp[2 *tmpStride];\
2232         const int tmp3= tmp[3 *tmpStride];\
2233         const int tmp4= tmp[4 *tmpStride];\
2234         const int tmp5= tmp[5 *tmpStride];\
2235         const int tmp6= tmp[6 *tmpStride];\
2236         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2237         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2238         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2239         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2240         dst++;\
2241         tmp++;\
2242     }\
2243 }\
2244 \
2245 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Horizontal 6-tap filter for an 8x8 block, tap weights \
         * (1, -5, 20, 20, -5, 1). Each of the 8 rows reads src[-2..10] and \
         * produces dst[0..7]. OP receives the raw sum and stores it; cm is \
         * declared for use by OP. \
         */\
2246     const int h=8;\
2247     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2248     int i;\
2249     for(i=0; i<h; i++)\
2250     {\
2251         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2252         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2253         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2254         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2255         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2256         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2257         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2258         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2259         dst+=dstStride;\
2260         src+=srcStride;\
2261     }\
2262 }\
2263 \
2264 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*\
         * Vertical 6-tap filter for an 8x8 block, tap weights \
         * (1, -5, 20, 20, -5, 1). For each of the 8 columns, the 13 samples at \
         * rows -2..10 (spaced srcStride apart) are loaded into locals and 8 \
         * outputs spaced dstStride apart are produced. OP receives the raw \
         * sum and stores it; cm is declared for use by OP. \
         */\
2265     const int w=8;\
2266     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2267     int i;\
2268     for(i=0; i<w; i++)\
2269     {\
2270         const int srcB= src[-2*srcStride];\
2271         const int srcA= src[-1*srcStride];\
2272         const int src0= src[0 *srcStride];\
2273         const int src1= src[1 *srcStride];\
2274         const int src2= src[2 *srcStride];\
2275         const int src3= src[3 *srcStride];\
2276         const int src4= src[4 *srcStride];\
2277         const int src5= src[5 *srcStride];\
2278         const int src6= src[6 *srcStride];\
2279         const int src7= src[7 *srcStride];\
2280         const int src8= src[8 *srcStride];\
2281         const int src9= src[9 *srcStride];\
2282         const int src10=src[10*srcStride];\
2283         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2284         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2285         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2286         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2287         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2288         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2289         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2290         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2291         dst++;\
2292         src++;\
2293     }\
2294 }\
2295 \
2296 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2297     const int h=8;\
2298     const int w=8;\
2299     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2300     int i;\
2301     src -= 2*srcStride;\
2302     for(i=0; i<h+5; i++)\
2303     {\
2304         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2305         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2306         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2307         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2308         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2309         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2310         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2311         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2312         tmp+=tmpStride;\
2313         src+=srcStride;\
2314     }\
2315     tmp -= tmpStride*(h+5-2);\
2316     for(i=0; i<w; i++)\
2317     {\
2318         const int tmpB= tmp[-2*tmpStride];\
2319         const int tmpA= tmp[-1*tmpStride];\
2320         const int tmp0= tmp[0 *tmpStride];\
2321         const int tmp1= tmp[1 *tmpStride];\
2322         const int tmp2= tmp[2 *tmpStride];\
2323         const int tmp3= tmp[3 *tmpStride];\
2324         const int tmp4= tmp[4 *tmpStride];\
2325         const int tmp5= tmp[5 *tmpStride];\
2326         const int tmp6= tmp[6 *tmpStride];\
2327         const int tmp7= tmp[7 *tmpStride];\
2328         const int tmp8= tmp[8 *tmpStride];\
2329         const int tmp9= tmp[9 *tmpStride];\
2330         const int tmp10=tmp[10*tmpStride];\
2331         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2332         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2333         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2334         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2335         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2336         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2337         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2338         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2339         dst++;\
2340         tmp++;\
2341     }\
2342 }\
2343 \
2344 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2345     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2346     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2347     src += 8*srcStride;\
2348     dst += 8*dstStride;\
2349     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2350     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2351 }\
2352 \
2353 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2354     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2355     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2356     src += 8*srcStride;\
2357     dst += 8*dstStride;\
2358     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2359     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2360 }\
2361 \
2362 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2363     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2364     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2365     src += 8*srcStride;\
2366     dst += 8*dstStride;\
2367     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2368     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2369 }\
2370
/**
 * Generate the sixteen OPNAME##h264_qpel##SIZE##_mcXY_c() entry points for
 * one block size.
 *
 * mc20 is the plain horizontal low-pass, mc02 the plain vertical one and
 * mc22 the combined (hv) one; the remaining positions blend two
 * intermediate planes with OPNAME##pixels##SIZE##_l2().
 *
 * Vertical filtering works on a private copy of the source that holds
 * SIZE+5 rows starting two rows above src, so "centre" (row 2 of that
 * copy) corresponds to the first row of the block itself.
 *
 * Intermediate planes are always produced with the put_ variants; only the
 * final store uses OPNAME.
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    uint8_t vpel[SIZE * SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, centre, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, centre, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    uint8_t vpel[SIZE * SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    /* blend with the row below the block origin */\
    OPNAME ## pixels ## SIZE ## _l2(dst, centre+SIZE, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    uint8_t hpel[SIZE * SIZE];\
    uint8_t vpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    uint8_t hpel[SIZE * SIZE];\
    uint8_t vpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    /* vertical plane is taken one column to the right */\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    uint8_t hpel[SIZE * SIZE];\
    uint8_t vpel[SIZE * SIZE];\
\
    /* horizontal plane is taken one row down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    uint8_t hpel[SIZE * SIZE];\
    uint8_t vpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE * (SIZE + 5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE * (SIZE + 5)];\
    uint8_t hpel[SIZE * SIZE];\
    uint8_t hvpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE * (SIZE + 5)];\
    uint8_t hpel[SIZE * SIZE];\
    uint8_t hvpel[SIZE * SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    int16_t scratch[SIZE * (SIZE + 5)];\
    uint8_t vpel[SIZE * SIZE];\
    uint8_t hvpel[SIZE * SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE * (SIZE + 5)];\
    uint8_t * const centre = padded + SIZE * 2;\
    int16_t scratch[SIZE * (SIZE + 5)];\
    uint8_t vpel[SIZE * SIZE];\
    uint8_t hvpel[SIZE * SIZE];\
\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, centre, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, hvpel, stride, SIZE, SIZE, SIZE);\
}
2507
/*
 * Store operators used by the H264_LOWPASS instantiations that follow.
 * Each one expects a clip-table pointer named "cm" to be in scope at the
 * point of expansion.
 *
 *   op_*  : single-pass result, rounded with +16 and scaled down by >>5
 *   op2_* : two-pass (hv) result, rounded with +512 and scaled down by >>10
 *   *_put : overwrite the destination
 *   *_avg : rounded average of the destination and the new value
 *
 * Note that the destination argument is evaluated twice by the avg forms.
 */
#define op_put(a, b)   a = cm[((b) + 16)>>5]
#define op_avg(a, b)   a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* disabled weighted variant, kept for reference: */
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2513
2514 H264_LOWPASS(put_       , op_put, op2_put)
2515 H264_LOWPASS(avg_       , op_avg, op2_avg)
2516 H264_MC(put_, 2)
2517 H264_MC(put_, 4)
2518 H264_MC(put_, 8)
2519 H264_MC(put_, 16)
2520 H264_MC(avg_, 4)
2521 H264_MC(avg_, 8)
2522 H264_MC(avg_, 16)
2523
2524 #undef op_avg
2525 #undef op_put
2526 #undef op2_avg
2527 #undef op2_put
2528 #endif
2529
/*
 * The full-pel (mc00) names for the 8x8 and 16x16 sizes resolve to the
 * exported fixed-size pixel routines rather than to separate functions.
 */
#define put_h264_qpel8_mc00_c   ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c   ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c  ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c  ff_avg_pixels16x16_c
2534
2535 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2536     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2537     int i;
2538
2539     for(i=0; i<h; i++){
2540         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2541         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2542         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2543         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2544         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2545         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2546         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2547         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2548         dst+=dstStride;
2549         src+=srcStride;
2550     }
2551 }
2552
/** Fixed-size wrapper: forwards to put_pixels8_c() with a height of 8. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
/** Fixed-size wrapper: forwards to avg_pixels8_c() with a height of 8. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}
/** Fixed-size wrapper: forwards to put_pixels16_c() with a height of 16. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}
/** Fixed-size wrapper: forwards to avg_pixels16_c() with a height of 16. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
2565
2566 #if CONFIG_RV40_DECODER
/** RV40 16x16 position (3,3): delegated to put_pixels16_xy2_c() with h = 16. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}
/** RV40 16x16 position (3,3): delegated to avg_pixels16_xy2_c() with h = 16. */
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
/** RV40 8x8 position (3,3): delegated to put_pixels8_xy2_c() with h = 8. */
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}
/** RV40 8x8 position (3,3): delegated to avg_pixels8_xy2_c() with h = 8. */
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
2579 #endif /* CONFIG_RV40_DECODER */
2580
2581 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2582     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2583     int i;
2584
2585     for(i=0; i<w; i++){
2586         const int src_1= src[ -srcStride];
2587         const int src0 = src[0          ];
2588         const int src1 = src[  srcStride];
2589         const int src2 = src[2*srcStride];
2590         const int src3 = src[3*srcStride];
2591         const int src4 = src[4*srcStride];
2592         const int src5 = src[5*srcStride];
2593         const int src6 = src[6*srcStride];
2594         const int src7 = src[7*srcStride];
2595         const int src8 = src[8*srcStride];
2596         const int src9 = src[9*srcStride];
2597         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2598         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2599         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2600         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2601         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2602         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2603         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2604         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2605         src++;
2606         dst++;
2607     }
2608 }
2609
/**
 * mspel position (1,0): horizontally filter the block into a packed 8x8
 * buffer, then combine it with the unfiltered source via put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2615
/** mspel position (2,0): horizontal low-pass written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2619
/**
 * mspel position (3,0): horizontally filter the block into a packed 8x8
 * buffer, then combine it with the source shifted one pixel to the right
 * via put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2625
/** mspel position (0,2): vertical low-pass written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2629
/**
 * mspel position (1,2): combine the vertically filtered block with the
 * horizontally-then-vertically filtered block via put_pixels8_l2().
 *
 * The horizontal pass covers 11 rows starting one row above src, which is
 * exactly what the following vertical pass needs; row 0 of the block is at
 * hrows + 8.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8 * 11];
    uint8_t vonly[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vonly, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vonly, both, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): like (1,2), except that the vertical-only plane is
 * computed from the source shifted one pixel to the right.
 *
 * The horizontal pass covers 11 rows starting one row above src; row 0 of
 * the block is at hrows + 8.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8 * 11];
    uint8_t vonly[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vonly, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vonly, both, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal low-pass over 11 rows (starting one row
 * above src) into a packed buffer, followed by a vertical low-pass from
 * that buffer into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8 * 11];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2653
2654 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2656     int x;
2657     const int strength= ff_h263_loop_filter_strength[qscale];
2658
2659     for(x=0; x<8; x++){
2660         int d1, d2, ad1;
2661         int p0= src[x-2*stride];
2662         int p1= src[x-1*stride];
2663         int p2= src[x+0*stride];
2664         int p3= src[x+1*stride];
2665         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2666
2667         if     (d<-2*strength) d1= 0;
2668         else if(d<-  strength) d1=-2*strength - d;
2669         else if(d<   strength) d1= d;
2670         else if(d< 2*strength) d1= 2*strength - d;
2671         else                   d1= 0;
2672
2673         p1 += d1;
2674         p2 -= d1;
2675         if(p1&256) p1= ~(p1>>31);
2676         if(p2&256) p2= ~(p2>>31);
2677
2678         src[x-1*stride] = p1;
2679         src[x+0*stride] = p2;
2680
2681         ad1= FFABS(d1)>>1;
2682
2683         d2= av_clip((p0-p3)/4, -ad1, ad1);
2684
2685         src[x-2*stride] = p0 - d2;
2686         src[x+  stride] = p3 + d2;
2687     }
2688     }
2689 }
2690
2691 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2692     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2693     int y;
2694     const int strength= ff_h263_loop_filter_strength[qscale];
2695
2696     for(y=0; y<8; y++){
2697         int d1, d2, ad1;
2698         int p0= src[y*stride-2];
2699         int p1= src[y*stride-1];
2700         int p2= src[y*stride+0];
2701         int p3= src[y*stride+1];
2702         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2703
2704         if     (d<-2*strength) d1= 0;
2705         else if(d<-  strength) d1=-2*strength - d;
2706         else if(d<   strength) d1= d;
2707         else if(d< 2*strength) d1= 2*strength - d;
2708         else                   d1= 0;
2709
2710         p1 += d1;
2711         p2 -= d1;
2712         if(p1&256) p1= ~(p1>>31);
2713         if(p2&256) p2= ~(p2>>31);
2714
2715         src[y*stride-1] = p1;
2716         src[y*stride+0] = p2;
2717
2718         ad1= FFABS(d1)>>1;
2719
2720         d2= av_clip((p0-p3)/4, -ad1, ad1);
2721
2722         src[y*stride-2] = p0 - d2;
2723         src[y*stride+1] = p3 + d2;
2724     }
2725     }
2726 }
2727
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into an int scratch plane; the top and bottom
 * rows are not filtered and are stored as 4*pixel so every scratch entry
 * has the same scale. Pass 2 filters horizontally and writes back; the
 * left and right columns only undo the vertical scale ((v + 2) >> 2),
 * interior columns use (l + 2*c + r + 8) >> 4.
 *
 * @param src    top-left pixel of the block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* unfiltered border rows, pre-scaled by 4 */
    for (col = 0; col < 8; col++) {
        vert[col]       = 4*src[col];
        vert[7*8 + col] = 4*src[7*stride + col];
    }

    /* vertical (1, 2, 1) on rows 1..6 */
    for (row = 1; row < 7; row++) {
        const uint8_t *line = src + row*stride;

        for (col = 0; col < 8; col++)
            vert[row*8 + col] = line[col - stride] + 2*line[col] + line[col + stride];
    }

    /* horizontal pass and write-back */
    for (row = 0; row < 8; row++) {
        uint8_t   *out = src  + row*stride;
        const int *in  = vert + row*8;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8)>>4;
    }
}
2754
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2782
/**
 * Sum of absolute differences between a 16-wide block and the horizontal
 * avg2() of each pixel of the reference with its right neighbour.
 * Reads pix2[0..16] on every row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2810
/**
 * Sum of absolute differences between a 16-wide block and the vertical
 * avg2() of each pixel of the reference with the pixel one row below.
 * Reads h + 1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2840
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (pixel, right, below,
 * below-right). Reads pix2[0..16] on h + 1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2870
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2890
/**
 * Sum of absolute differences between an 8-wide block and the horizontal
 * avg2() of each pixel of the reference with its right neighbour.
 * Reads pix2[0..8] on every row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2910
/**
 * Sum of absolute differences between an 8-wide block and the vertical
 * avg2() of each pixel of the reference with the pixel one row below.
 * Reads h + 1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2932
/**
 * Sum of absolute differences between an 8-wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (pixel, right, below,
 * below-right). Reads pix2[0..8] on h + 1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2954
2955 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2956     MpegEncContext *c = v;
2957     int score1=0;
2958     int score2=0;
2959     int x,y;
2960
2961     for(y=0; y<h; y++){
2962         for(x=0; x<16; x++){
2963             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2964         }
2965         if(y+1<h){
2966             for(x=0; x<15; x++){
2967                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2968                              - s1[x+1] + s1[x+1+stride])
2969                         -FFABS(  s2[x  ] - s2[x  +stride]
2970                              - s2[x+1] + s2[x+1+stride]);
2971             }
2972         }
2973         s1+= stride;
2974         s2+= stride;
2975     }
2976
2977     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2978     else  return score1 + FFABS(score2)*8;
2979 }
2980
2981 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2982     MpegEncContext *c = v;
2983     int score1=0;
2984     int score2=0;
2985     int x,y;
2986
2987     for(y=0; y<h; y++){
2988         for(x=0; x<8; x++){
2989             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2990         }
2991         if(y+1<h){
2992             for(x=0; x<7; x++){
2993                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2994                              - s1[x+1] + s1[x+1+stride])
2995                         -FFABS(  s2[x  ] - s2[x  +stride]
2996                              - s2[x+1] + s2[x+1+stride]);
2997             }
2998         }
2999         s1+= stride;
3000         s2+= stride;
3001     }
3002
3003     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3004     else  return score1 + FFABS(score2)*8;
3005 }
3006
3007 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3008     int i;
3009     unsigned int sum=0;
3010
3011     for(i=0; i<8*8; i++){
3012         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3013         int w= weight[i];
3014         b>>= RECON_SHIFT;
3015         assert(-512<b && b<512);
3016
3017         sum += (w*b)*(w*b)>>4;
3018     }
3019     return sum>>2;
3020 }
3021
3022 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3023     int i;
3024
3025     for(i=0; i<8*8; i++){
3026         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3027     }
3028 }
3029
3030 /**
3031  * permutes an 8x8 block.
3032  * @param block the block which will be permuted according to the given permutation vector
3033  * @param permutation the permutation vector
3034  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3035  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3036  *                  (inverse) permutated to scantable order!
3037  */
3038 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3039 {
3040     int i;
3041     DCTELEM temp[64];
3042
3043     if(last<=0) return;
3044     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3045
3046     for(i=0; i<=last; i++){
3047         const int j= scantable[i];
3048         temp[j]= block[j];
3049         block[j]=0;
3050     }
3051
3052     for(i=0; i<=last; i++){
3053         const int j= scantable[i];
3054         const int perm_j= permutation[j];
3055         block[perm_j]= temp[j];
3056     }
3057 }
3058
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int score = 0;

    return score;
}
3062
3063 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3064     int i;
3065
3066     memset(cmp, 0, sizeof(void*)*6);
3067
3068     for(i=0; i<6; i++){
3069         switch(type&0xFF){
3070         case FF_CMP_SAD:
3071             cmp[i]= c->sad[i];
3072             break;
3073         case FF_CMP_SATD:
3074             cmp[i]= c->hadamard8_diff[i];
3075             break;
3076         case FF_CMP_SSE:
3077             cmp[i]= c->sse[i];
3078             break;
3079         case FF_CMP_DCT:
3080             cmp[i]= c->dct_sad[i];
3081             break;
3082         case FF_CMP_DCT264:
3083             cmp[i]= c->dct264_sad[i];
3084             break;
3085         case FF_CMP_DCTMAX:
3086             cmp[i]= c->dct_max[i];
3087             break;
3088         case FF_CMP_PSNR:
3089             cmp[i]= c->quant_psnr[i];
3090             break;
3091         case FF_CMP_BIT:
3092             cmp[i]= c->bit[i];
3093             break;
3094         case FF_CMP_RD:
3095             cmp[i]= c->rd[i];
3096             break;
3097         case FF_CMP_VSAD:
3098             cmp[i]= c->vsad[i];
3099             break;
3100         case FF_CMP_VSSE:
3101             cmp[i]= c->vsse[i];
3102             break;
3103         case FF_CMP_ZERO:
3104             cmp[i]= zero_cmp;
3105             break;
3106         case FF_CMP_NSSE:
3107             cmp[i]= c->nsse[i];
3108             break;
3109 #if CONFIG_DWT
3110         case FF_CMP_W53:
3111             cmp[i]= c->w53[i];
3112             break;
3113         case FF_CMP_W97:
3114             cmp[i]= c->w97[i];
3115             break;
3116 #endif
3117         default:
3118             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3119         }
3120     }
3121 }
3122
3123 static void clear_block_c(DCTELEM *block)
3124 {
3125     memset(block, 0, sizeof(DCTELEM)*64);
3126 }
3127
3128 /**
3129  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3130  */
3131 static void clear_blocks_c(DCTELEM *blocks)
3132 {
3133     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3134 }
3135
3136 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3137     long i;
3138     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3139         long a = *(long*)(src+i);
3140         long b = *(long*)(dst+i);
3141         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3142     }
3143     for(; i<w; i++)
3144         dst[i+0] += src[i+0];
3145 }
3146
3147 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3148     long i;
3149     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3150         long a = *(long*)(src1+i);
3151         long b = *(long*)(src2+i);
3152         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3153     }
3154     for(; i<w; i++)
3155         dst[i] = src1[i]+src2[i];
3156 }
3157
3158 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3159     long i;
3160 #if !HAVE_FAST_UNALIGNED
3161     if((long)src2 & (sizeof(long)-1)){
3162         for(i=0; i+7<w; i+=8){
3163             dst[i+0] = src1[i+0]-src2[i+0];
3164             dst[i+1] = src1[i+1]-src2[i+1];
3165             dst[i+2] = src1[i+2]-src2[i+2];
3166             dst[i+3] = src1[i+3]-src2[i+3];
3167             dst[i+4] = src1[i+4]-src2[i+4];
3168             dst[i+5] = src1[i+5]-src2[i+5];
3169             dst[i+6] = src1[i+6]-src2[i+6];
3170             dst[i+7] = src1[i+7]-src2[i+7];
3171         }
3172     }else
3173 #endif
3174     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3175         long a = *(long*)(src1+i);
3176         long b = *(long*)(src2+i);
3177         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3178     }
3179     for(; i<w; i++)
3180         dst[i+0] = src1[i+0]-src2[i+0];
3181 }
3182
/**
 * Reconstruct a row from residuals using a three-way predictor.
 *
 * For each position the prediction is mid_pred() of the previous output,
 * the sample above (src1) and (previous + above - above-left) masked to
 * 8 bits; the residual from diff is added to it and the result, truncated
 * to 8 bits, is stored in dst.
 *
 * @param dst      output row
 * @param src1     row above
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last output sample
 * @param left_top in: sample above-left of the row start; out: last src1 sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top = src1[x];

        cur     = mid_pred(cur, top, (cur + top - topleft)&0xFF) + diff[x];
        topleft = top;
        dst[x]  = cur;
    }

    *left     = cur;
    *left_top = topleft;
}
3199
/**
 * Compute residuals of a row against a three-way predictor.
 *
 * For each position the prediction is mid_pred() of the previous src2
 * sample, the sample above (src1) and (previous + above - above-left)
 * masked to 8 bits; dst receives src2 minus that prediction, truncated
 * to 8 bits.
 *
 * @param dst      output residuals
 * @param src1     row above
 * @param src2     current row
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last src2 sample
 * @param left_top in: sample above-left of the row start; out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top = src1[x];
        const int pred = mid_pred(cur, top, (cur + top - topleft)&0xFF);

        topleft = top;
        cur     = src2[x];
        dst[x]  = cur - pred;
    }

    *left     = cur;
    *left_top = topleft;
}
3217
/**
 * Running sum along a row: each output byte is the accumulator after
 * adding the corresponding input byte, truncated to 8 bits.
 *
 * @param dst output row
 * @param src input residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the (untruncated) accumulator after the last sample
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3236
/* byte offsets of the four channels inside one 32-bit pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over a row of 4-byte pixels: every output
 * channel byte is that channel's accumulator after adding the input byte,
 * truncated to 8 bits.
 *
 * @param dst   output pixels, 4 bytes each
 * @param src   input residuals, 4 bytes each
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator; out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3277
/**
 * o1 = i1 + i2, o2 = i1 - i2.
 * i1 and i2 are each evaluated twice. Wrapped in do/while(0) so the macro
 * behaves as a single statement (safe inside an unbraced if/else).
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    o1= (i1)+(i2);\
    o2= (i1)-(i2);\
} while (0)

/**
 * In-place butterfly: x becomes x + y and y becomes x - y.
 * The temporaries carry a prefix so they cannot capture an argument
 * that happens to be named a or b.
 */
#define BUTTERFLY1(x,y) \
do {\
    const int bf_a= (x);\
    const int bf_b= (y);\
    x= bf_a+bf_b;\
    y= bf_a-bf_b;\
} while (0)

/** |x + y| + |x - y|; both arguments are evaluated twice. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3292
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the pixel difference src - dst.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows (same for both blocks)
 * @param h      must be 8
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of differences */
    for (n = 0; n < 8; n++) {
        //FIXME try pointer walks
        int *t = temp + 8*n;
        const uint8_t *ps = src + stride*n;
        const uint8_t *pd = dst + stride*n;

        BUTTERFLY2(t[0], t[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(t[2], t[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(t[4], t[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(t[6], t[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two stages in place, the last one folded into the
     * absolute-value accumulation */
    for (n = 0; n < 8; n++) {
        int *t = temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
              +BUTTERFLYA(t[8*1], t[8*5])
              +BUTTERFLYA(t[8*2], t[8*6])
              +BUTTERFLYA(t[8*3], t[8*7]);
    }

    return sum;
}
3344
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the pixels of src, with the absolute value of the first (mean)
 * coefficient removed from the total.
 *
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8
 * @return sum of absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row */
    for (n = 0; n < 8; n++) {
        //FIXME try pointer walks
        int *t = temp + 8*n;
        const uint8_t *ps = src + stride*n;

        BUTTERFLY2(t[0], t[1], ps[0], ps[1]);
        BUTTERFLY2(t[2], t[3], ps[2], ps[3]);
        BUTTERFLY2(t[4], t[5], ps[4], ps[5]);
        BUTTERFLY2(t[6], t[7], ps[6], ps[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two stages in place, the last one folded into the
     * absolute-value accumulation */
    for (n = 0; n < 8; n++) {
        int *t = temp + n;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
              +BUTTERFLYA(t[8*1], t[8*5])
              +BUTTERFLYA(t[8*2], t[8*6])
              +BUTTERFLYA(t[8*3], t[8*7]);
    }

    /* take the mean term back out of the total */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3392
3393 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3394     MpegEncContext * const s= (MpegEncContext *)c;
3395     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3396
3397     assert(h==8);
3398
3399     s->dsp.diff_pixels(temp, src1, src2, stride);
3400     s->dsp.fdct(temp);
3401     return s->dsp.sum_abs_dctelem(temp);
3402 }
3403
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built from adds and shifts only.
 * Inputs are read through SRC(n) and every result is handed to DST(n, v);
 * both must be defined by the user of the macro. All inputs are read into
 * locals before the first DST, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform, applied along
 * both dimensions, of the difference between two 8x8 blocks.
 *
 * @param c      MpegEncContext pointer
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between rows
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s = c;
    DCTELEM dct[8][8];
    int total = 0;
    int k;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform each row dct[k][*] in place */
#define SRC(x) dct[k][x]
#define DST(x,v) dct[k][x]= v
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform each column dct[*][k], accumulating
     * absolute values instead of storing the results */
#define SRC(x) dct[x][k]
#define DST(x,v) total += FFABS(v)
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return total;
}
#endif
3456
3457 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3458     MpegEncContext * const s= (MpegEncContext *)c;
3459     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3460     int sum=0, i;
3461
3462     assert(h==8);
3463
3464     s->dsp.diff_pixels(temp, src1, src2, stride);
3465     s->dsp.fdct(temp);
3466
3467     for(i=0; i<64; i++)
3468         sum= FFMAX(sum, FFABS(temp[i]));
3469
3470     return sum;
3471 }
3472
3473 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474     MpegEncContext * const s= (MpegEncContext *)c;
3475     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3476     DCTELEM * const bak = temp+64;
3477     int sum=0, i;
3478
3479     assert(h==8);
3480     s->mb_intra=0;
3481
3482     s->dsp.diff_pixels(temp, src1, src2, stride);
3483
3484     memcpy(bak, temp, 64*sizeof(DCTELEM));
3485
3486     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3487     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3488     ff_simple_idct(temp); //FIXME
3489
3490     for(i=0; i<64; i++)
3491         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3492
3493     return sum;
3494 }
3495
3496 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3497     MpegEncContext * const s= (MpegEncContext *)c;
3498     const uint8_t *scantable= s->intra_scantable.permutated;
3499     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3500     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3501     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3502     int i, last, run, bits, level, distortion, start_i;
3503     const int esc_length= s->ac_esc_length;
3504     uint8_t * length;
3505     uint8_t * last_length;
3506
3507     assert(h==8);
3508
3509     copy_block8(lsrc1, src1, 8, stride, 8);
3510     copy_block8(lsrc2, src2, 8, stride, 8);
3511
3512     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3513
3514     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3515
3516     bits=0;
3517
3518     if (s->mb_intra) {
3519         start_i = 1;
3520         length     = s->intra_ac_vlc_length;
3521         last_length= s->intra_ac_vlc_last_length;
3522         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3523     } else {
3524         start_i = 0;
3525         length     = s->inter_ac_vlc_length;
3526         last_length= s->inter_ac_vlc_last_length;
3527     }
3528
3529     if(last>=start_i){
3530         run=0;
3531         for(i=start_i; i<last; i++){
3532             int j= scantable[i];
3533             level= temp[j];
3534
3535             if(level){
3536                 level+=64;
3537                 if((level&(~127)) == 0){
3538                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3539                 }else
3540                     bits+= esc_length;
3541                 run=0;
3542             }else
3543                 run++;
3544         }
3545         i= scantable[last];
3546
3547         level= temp[i] + 64;
3548
3549         assert(level - 64);
3550
3551         if((level&(~127)) == 0){
3552             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3553         }else
3554             bits+= esc_length;
3555
3556     }
3557
3558     if(last>=0){
3559         if(s->mb_intra)
3560             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3561         else
3562             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3563     }
3564
3565     s->dsp.idct_add(lsrc2, 8, temp);
3566
3567     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3568
3569     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3570 }
3571
3572 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3573     MpegEncContext * const s= (MpegEncContext *)c;
3574     const uint8_t *scantable= s->intra_scantable.permutated;
3575     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3576     int i, last, run, bits, level, start_i;
3577     const int esc_length= s->ac_esc_length;
3578     uint8_t * length;
3579     uint8_t * last_length;
3580
3581     assert(h==8);
3582
3583     s->dsp.diff_pixels(temp, src1, src2, stride);
3584
3585     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3586
3587     bits=0;
3588
3589     if (s->mb_intra) {
3590         start_i = 1;
3591         length     = s->intra_ac_vlc_length;
3592         last_length= s->intra_ac_vlc_last_length;
3593         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3594     } else {
3595         start_i = 0;
3596         length     = s->inter_ac_vlc_length;
3597         last_length= s->inter_ac_vlc_last_length;
3598     }
3599
3600     if(last>=start_i){
3601         run=0;
3602         for(i=start_i; i<last; i++){
3603             int j= scantable[i];
3604             level= temp[j];
3605
3606             if(level){
3607                 level+=64;
3608                 if((level&(~127)) == 0){
3609                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3610                 }else
3611                     bits+= esc_length;
3612                 run=0;
3613             }else
3614                 run++;
3615         }
3616         i= scantable[last];
3617
3618         level= temp[i] + 64;
3619
3620         assert(level - 64);
3621
3622         if((level&(~127)) == 0){
3623             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3624         }else
3625             bits+= esc_length;
3626     }
3627
3628     return bits;
3629 }
3630
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * each pixel and the pixel directly below it, over a block <size> pixels
 * wide and h rows tall. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - s[col+stride]);                                                         \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3648
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: for
 * every pixel, the difference s1 - s2 is compared with the same difference
 * one row below and the absolute changes are summed.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows (same for both blocks)
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
    }

    return total;
}
3663
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * each pixel and the pixel directly below it, over a block <size> pixels
 * wide and h rows tall. The c and dummy arguments are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - s[col+stride]);                                                            \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3682
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: for
 * every pixel, the difference s1 - s2 is compared with the same difference
 * one row below and the squared changes are summed.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between rows (same for both blocks)
 * @param h      number of rows
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
    }

    return total;
}
3697
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }

    return total;
}
3706
/* Instantiate the 16-pixel variants of the 8x8 comparison functions above.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this part of the file;
 * presumably it builds the second function by calling the first on each
 * 8x8 quadrant and summing the results - confirm against its definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3717
/**
 * Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len).
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    while (len-- > 0)
        *dst++ = *src0++ * *src1++;
}
3723
/**
 * Product of src0 with src1 read backwards:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
3730
/**
 * Multiply-add: dst[i] = src0[i] * src1[i] + src2[i] for i in [0, len).
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3736
/**
 * Windowed overlap of two half-frames.
 *
 * Produces 2*len outputs from len samples of src0, len samples of src1
 * and a 2*len entry window. For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo, hi;

    for (lo = 0, hi = 2*len - 1; lo < len; lo++, hi--) {
        float s0 = src0[lo];
        float s1 = src1[hi - len];
        float wi = win[lo];
        float wj = win[hi];

        dst[lo] = s0*wj - s1*wi;
        dst[hi] = s0*wi + s1*wj;
    }
}
3753
/**
 * Scale a vector: dst[i] = src[i] * mul for i in [0, len).
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    while (len-- > 0)
        *dst++ = *src++ * mul;
}
3761
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Each entry of sv points at two floats; pair n of the output is
 * src[2n+j] * sv[n][j] * mul for j = 0, 1. Two elements are written per
 * step for as long as the start index is below len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    for (n = 0; 2*n < len; n++) {
        const float *vec = sv[n];

        dst[2*n  ] = src[2*n  ] * vec[0] * mul;
        dst[2*n+1] = src[2*n+1] * vec[1] * mul;
    }
}
3771
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Each entry of sv points at four floats; group n of the output is
 * src[4n+j] * sv[n][j] * mul for j = 0..3. Four elements are written per
 * step for as long as the start index is below len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n;

    for (n = 0; 4*n < len; n++) {
        const float *vec = sv[n];

        dst[4*n  ] = src[4*n  ] * vec[0] * mul;
        dst[4*n+1] = src[4*n+1] * vec[1] * mul;
        dst[4*n+2] = src[4*n+2] * vec[2] * mul;
        dst[4*n+3] = src[4*n+3] * vec[3] * mul;
    }
}
3783
/**
 * Expand a sequence of 2-element vectors, scaled by mul, into dst.
 *
 * Pair n of the output is sv[n][j] * mul for j = 0, 1. Two elements are
 * written per step for as long as the start index is below len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    for (n = 0; 2*n < len; n++) {
        const float *vec = sv[n];

        dst[2*n  ] = vec[0] * mul;
        dst[2*n+1] = vec[1] * mul;
    }
}
3793
/**
 * Expand a sequence of 4-element vectors, scaled by mul, into dst.
 *
 * Group n of the output is sv[n][j] * mul for j = 0..3. Four elements are
 * written per step for as long as the start index is below len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;

    for (n = 0; 4*n < len; n++) {
        const float *vec = sv[n];

        dst[4*n  ] = vec[0] * mul;
        dst[4*n+1] = vec[1] * mul;
        dst[4*n+2] = vec[2] * mul;
        dst[4*n+3] = vec[3] * mul;
    }
}
3805
/**
 * In-place butterfly on two vectors: v1[i] becomes v1[i] + v2[i] and
 * v2[i] becomes v1[i] - v2[i] (using the old values) for i in [0, len).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k = 0;

    while (k < len) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
        k++;
    }
}
3816
/**
 * Dot product of two float vectors, accumulated in single precision in
 * index order.
 *
 * @return sum of v1[i] * v2[i] for i in [0, len)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;

    for (; len > 0; len--)
        acc += *v1++ * *v2++;

    return acc;
}
3827
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * As used by vector_clipf_c_opposite_sign, the arguments are the raw
 * 32-bit patterns of floats, with mini negative and maxi positive.
 *
 * @param a        value to clip
 * @param mini     lower bound (has the top bit set)
 * @param maxi     upper bound
 * @param maxisign maxi with its top bit flipped
 * @return mini if a compares above mini as an unsigned integer, maxi if a
 *         with its top bit flipped compares above maxisign, a otherwise
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* both bounds have the top bit set, so anything above mini lies beyond it */
    if(a > mini) return mini;
    /* 1U, not 1: shifting a signed 1 into the sign bit is undefined behaviour */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
3836
/**
 * Clip a float vector to [*min, *max] with integer comparisons on the raw
 * bit patterns, via clipf_c_one(). The caller (vector_clipf_c) only uses
 * this path when *min < 0 and *max > 0.
 *
 * Elements are processed in groups of 8 for as long as the group start is
 * below len, so the buffers must cover len rounded up to a multiple of 8.
 *
 * @param dst destination
 * @param src source
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U, not 1: shifting a signed 1 into the sign bit is undefined behaviour */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When the bounds have opposite signs the integer-compare path
 * (vector_clipf_c_opposite_sign) is taken, otherwise av_clipf() is applied
 * per element. Either way elements are processed in groups of 8 for as
 * long as the group start is below len.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3872
/**
 * Dot product of two int16_t vectors, each product shifted right by
 * shift before being accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every product
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
3882
/**
 * Dot product of v1 and v2 combined with an in-place update of v1.
 *
 * For each element the product v1[i]*v2[i] is accumulated using the value
 * of v1[i] before the update, then mul*v3[i] is added to v1[i].
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier for v3
 * @return the dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k   = 0;

    for (; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }

    return acc;
}
3892
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 inverse DCT: transforms the 8 consecutive
 * coefficients at b in place, rounding and shifting the results down
 * by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    const int x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
    const int x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
    const int rnd = 1 << 7;

    /* step 1: weighted input pairs */
    const int a0 = W0*x0 + W0*x4;
    const int a4 = W0*x0 - W0*x4;
    const int a2 = W2*x2 + W6*x6;
    const int a6 = W6*x2 - W2*x6;
    const int a1 = W1*x1 + W7*x7;
    const int a7 = W7*x1 - W1*x7;
    const int a5 = W5*x5 + W3*x3;
    const int a3 = W3*x5 - W5*x3;

    /* step 2: rotated odd terms */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* step 3: combine even and odd parts; output n and output 7-n share
     * the even part and differ only in the sign of the odd part */
    const int e0 = a0 + a2, o0 = a1 + a5;
    const int e1 = a4 + a6;
    const int e2 = a4 - a6;
    const int e3 = a0 - a2, o3 = a7 + a3;

    b[0] = (e0 + o0 + rnd)>>8;
    b[1] = (e1 + s1 + rnd)>>8;
    b[2] = (e2 + s2 + rnd)>>8;
    b[3] = (e3 + o3 + rnd)>>8;
    b[4] = (e3 - o3 + rnd)>>8;
    b[5] = (e2 - s2 + rnd)>>8;
    b[6] = (e1 - s1 + rnd)>>8;
    b[7] = (e0 - o0 + rnd)>>8;
}
3928 static void wmv2_idct_col(short * b)
3929 {
3930     int s1,s2;
3931     int a0,a1,a2,a3,a4,a5,a6,a7;
3932     /*step 1, with extended precision*/
3933     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3934     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3935     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3936     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3937     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3938     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3939     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3940     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3941     /*step 2*/
3942     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3943     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3944     /*step 3*/
3945     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3946     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3947     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3948     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3949
3950     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3951     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3952     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3953     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3954 }
/**
 * In-place two-dimensional WMV2 inverse DCT of a 64-element block:
 * wmv2_idct_row() is applied to each group of eight consecutive
 * coefficients, then wmv2_idct_col() to each of the eight columns.
 *
 * @param block the 64 coefficients, overwritten with the result
 */
void ff_wmv2_idct_c(short *block)
{
    int row, col;

    /* horizontal pass: row r starts at block[8 * r] */
    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    /* vertical pass: column c starts at block[c] */
    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
3965 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3966  converted */
3967 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3968 {
3969     ff_wmv2_idct_c(block);
3970     ff_put_pixels_clamped_c(block, dest, line_size);
3971 }
3972 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3973 {
3974     ff_wmv2_idct_c(block);
3975     ff_add_pixels_clamped_c(block, dest, line_size);
3976 }
3977 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3978 {
3979     j_rev_dct (block);
3980     ff_put_pixels_clamped_c(block, dest, line_size);
3981 }
3982 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3983 {
3984     j_rev_dct (block);
3985     ff_add_pixels_clamped_c(block, dest, line_size);
3986 }
3987
3988 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3989 {
3990     j_rev_dct4 (block);
3991     put_pixels_clamped4_c(block, dest, line_size);
3992 }
3993 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3994 {
3995     j_rev_dct4 (block);
3996     add_pixels_clamped4_c(block, dest, line_size);
3997 }
3998
3999 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4000 {
4001     j_rev_dct2 (block);
4002     put_pixels_clamped2_c(block, dest, line_size);
4003 }
4004 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4005 {
4006     j_rev_dct2 (block);
4007     add_pixels_clamped2_c(block, dest, line_size);
4008 }
4009
4010 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4011 {
4012     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4013
4014     dest[0] = cm[(block[0] + 4)>>3];
4015 }
4016 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4017 {
4018     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4019
4020     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4021 }
4022
4023 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4024
4025 /* init static data */
4026 av_cold void dsputil_static_init(void)
4027 {
4028     int i;
4029
4030     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4031     for(i=0;i<MAX_NEG_CROP;i++) {
4032         ff_cropTbl[i] = 0;
4033         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4034     }
4035
4036     for(i=0;i<512;i++) {
4037         ff_squareTbl[i] = (i - 256) * (i - 256);
4038     }
4039
4040     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4041 }
4042
4043 int ff_check_alignment(void){
4044     static int did_fail=0;
4045     DECLARE_ALIGNED(16, int, aligned);
4046
4047     if((intptr_t)&aligned & 15){
4048         if(!did_fail){
4049 #if HAVE_MMX || HAVE_ALTIVEC
4050             av_log(NULL, AV_LOG_ERROR,
4051                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4052                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4053                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4054                 "Do not report crashes to FFmpeg developers.\n");
4055 #endif
4056             did_fail=1;
4057         }
4058         return -1;
4059     }
4060     return 0;
4061 }
4062
4063 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4064 {
4065     int i;
4066
4067     ff_check_alignment();
4068
4069 #if CONFIG_ENCODERS
4070     if(avctx->dct_algo==FF_DCT_FASTINT) {
4071         c->fdct = fdct_ifast;
4072         c->fdct248 = fdct_ifast248;
4073     }
4074     else if(avctx->dct_algo==FF_DCT_FAAN) {
4075         c->fdct = ff_faandct;
4076         c->fdct248 = ff_faandct248;
4077     }
4078     else {
4079         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4080         c->fdct248 = ff_fdct248_islow;
4081     }
4082 #endif //CONFIG_ENCODERS
4083
4084     if(avctx->lowres==1){
4085         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4086             c->idct_put= ff_jref_idct4_put;
4087             c->idct_add= ff_jref_idct4_add;
4088         }else{
4089             c->idct_put= ff_h264_lowres_idct_put_c;
4090             c->idct_add= ff_h264_lowres_idct_add_c;
4091         }
4092         c->idct    = j_rev_dct4;
4093         c->idct_permutation_type= FF_NO_IDCT_PERM;
4094     }else if(avctx->lowres==2){
4095         c->idct_put= ff_jref_idct2_put;
4096         c->idct_add= ff_jref_idct2_add;
4097         c->idct    = j_rev_dct2;
4098         c->idct_permutation_type= FF_NO_IDCT_PERM;
4099     }else if(avctx->lowres==3){
4100         c->idct_put= ff_jref_idct1_put;
4101         c->idct_add= ff_jref_idct1_add;
4102         c->idct    = j_rev_dct1;
4103         c->idct_permutation_type= FF_NO_IDCT_PERM;
4104     }else{
4105         if(avctx->idct_algo==FF_IDCT_INT){
4106             c->idct_put= ff_jref_idct_put;
4107             c->idct_add= ff_jref_idct_add;
4108             c->idct    = j_rev_dct;
4109             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4110         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4111                 avctx->idct_algo==FF_IDCT_VP3){
4112             c->idct_put= ff_vp3_idct_put_c;
4113             c->idct_add= ff_vp3_idct_add_c;
4114             c->idct    = ff_vp3_idct_c;
4115             c->idct_permutation_type= FF_NO_IDCT_PERM;
4116         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4117             c->idct_put= ff_wmv2_idct_put_c;
4118             c->idct_add= ff_wmv2_idct_add_c;
4119             c->idct    = ff_wmv2_idct_c;
4120             c->idct_permutation_type= FF_NO_IDCT_PERM;
4121         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4122             c->idct_put= ff_faanidct_put;
4123             c->idct_add= ff_faanidct_add;
4124             c->idct    = ff_faanidct;
4125             c->idct_permutation_type= FF_NO_IDCT_PERM;
4126         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4127             c->idct_put= ff_ea_idct_put_c;
4128             c->idct_permutation_type= FF_NO_IDCT_PERM;
4129         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4130             c->idct     = ff_bink_idct_c;
4131             c->idct_add = ff_bink_idct_add_c;
4132             c->idct_put = ff_bink_idct_put_c;
4133             c->idct_permutation_type = FF_NO_IDCT_PERM;
4134         }else{ //accurate/default
4135             c->idct_put= ff_simple_idct_put;
4136             c->idct_add= ff_simple_idct_add;
4137             c->idct    = ff_simple_idct;
4138             c->idct_permutation_type= FF_NO_IDCT_PERM;
4139         }
4140     }
4141
4142     c->get_pixels = get_pixels_c;
4143     c->diff_pixels = diff_pixels_c;
4144     c->put_pixels_clamped = ff_put_pixels_clamped_c;
4145     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4146     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4147     c->add_pixels_clamped = ff_add_pixels_clamped_c;
4148     c->add_pixels8 = add_pixels8_c;
4149     c->add_pixels4 = add_pixels4_c;
4150     c->sum_abs_dctelem = sum_abs_dctelem_c;
4151     c->emulated_edge_mc = ff_emulated_edge_mc;
4152     c->gmc1 = gmc1_c;
4153     c->gmc = ff_gmc_c;
4154     c->clear_block = clear_block_c;
4155     c->clear_blocks = clear_blocks_c;
4156     c->pix_sum = pix_sum_c;
4157     c->pix_norm1 = pix_norm1_c;
4158
4159     c->fill_block_tab[0] = fill_block16_c;
4160     c->fill_block_tab[1] = fill_block8_c;
4161     c->scale_block = scale_block_c;
4162
4163     /* TODO [0] 16  [1] 8 */
4164     c->pix_abs[0][0] = pix_abs16_c;
4165     c->pix_abs[0][1] = pix_abs16_x2_c;
4166     c->pix_abs[0][2] = pix_abs16_y2_c;
4167     c->pix_abs[0][3] = pix_abs16_xy2_c;
4168     c->pix_abs[1][0] = pix_abs8_c;
4169     c->pix_abs[1][1] = pix_abs8_x2_c;
4170     c->pix_abs[1][2] = pix_abs8_y2_c;
4171     c->pix_abs[1][3] = pix_abs8_xy2_c;
4172
4173 #define dspfunc(PFX, IDX, NUM) \
4174     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4175     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4176     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4177     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4178
4179     dspfunc(put, 0, 16);
4180     dspfunc(put_no_rnd, 0, 16);
4181     dspfunc(put, 1, 8);
4182     dspfunc(put_no_rnd, 1, 8);
4183     dspfunc(put, 2, 4);
4184     dspfunc(put, 3, 2);
4185
4186     dspfunc(avg, 0, 16);
4187     dspfunc(avg_no_rnd, 0, 16);
4188     dspfunc(avg, 1, 8);
4189     dspfunc(avg_no_rnd, 1, 8);
4190     dspfunc(avg, 2, 4);
4191     dspfunc(avg, 3, 2);
4192 #undef dspfunc
4193
4194     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4195     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4196
4197     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4198     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4199     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4200     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4201     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4202     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4203     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4204     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4205     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4206
4207     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4208     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4209     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4210     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4211     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4212     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4213     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4214     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4215     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4216
4217 #define dspfunc(PFX, IDX, NUM) \
4218     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4219     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4220     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4221     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4222     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4223     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4224     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4225     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4226     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4227     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4228     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4229     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4230     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4231     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4232     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4233     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4234
4235     dspfunc(put_qpel, 0, 16);
4236     dspfunc(put_no_rnd_qpel, 0, 16);
4237
4238     dspfunc(avg_qpel, 0, 16);
4239     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4240
4241     dspfunc(put_qpel, 1, 8);
4242     dspfunc(put_no_rnd_qpel, 1, 8);
4243
4244     dspfunc(avg_qpel, 1, 8);
4245     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4246
4247     dspfunc(put_h264_qpel, 0, 16);
4248     dspfunc(put_h264_qpel, 1, 8);
4249     dspfunc(put_h264_qpel, 2, 4);
4250     dspfunc(put_h264_qpel, 3, 2);
4251     dspfunc(avg_h264_qpel, 0, 16);
4252     dspfunc(avg_h264_qpel, 1, 8);
4253     dspfunc(avg_h264_qpel, 2, 4);
4254
4255 #undef dspfunc
4256     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4257     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4258     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4259     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4260     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4261     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4262
4263     c->draw_edges = draw_edges_c;
4264
4265 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4266     ff_mlp_init(c, avctx);
4267 #endif
4268 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4269     ff_intrax8dsp_init(c,avctx);
4270 #endif
4271 #if CONFIG_RV30_DECODER
4272     ff_rv30dsp_init(c,avctx);
4273 #endif
4274 #if CONFIG_RV40_DECODER
4275     ff_rv40dsp_init(c,avctx);
4276     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4277     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4278     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4279     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4280 #endif
4281
4282     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4283     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4284     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4285     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4286     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4287     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4288     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4289     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4290
4291 #define SET_CMP_FUNC(name) \
4292     c->name[0]= name ## 16_c;\
4293     c->name[1]= name ## 8x8_c;
4294
4295     SET_CMP_FUNC(hadamard8_diff)
4296     c->hadamard8_diff[4]= hadamard8_intra16_c;
4297     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4298     SET_CMP_FUNC(dct_sad)
4299     SET_CMP_FUNC(dct_max)
4300 #if CONFIG_GPL
4301     SET_CMP_FUNC(dct264_sad)
4302 #endif
4303     c->sad[0]= pix_abs16_c;
4304     c->sad[1]= pix_abs8_c;
4305     c->sse[0]= sse16_c;
4306     c->sse[1]= sse8_c;
4307     c->sse[2]= sse4_c;
4308     SET_CMP_FUNC(quant_psnr)
4309     SET_CMP_FUNC(rd)
4310     SET_CMP_FUNC(bit)
4311     c->vsad[0]= vsad16_c;
4312     c->vsad[4]= vsad_intra16_c;
4313     c->vsad[5]= vsad_intra8_c;
4314     c->vsse[0]= vsse16_c;
4315     c->vsse[4]= vsse_intra16_c;
4316     c->vsse[5]= vsse_intra8_c;
4317     c->nsse[0]= nsse16_c;
4318     c->nsse[1]= nsse8_c;
4319 #if CONFIG_DWT
4320     ff_dsputil_init_dwt(c);
4321 #endif
4322
4323     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4324
4325     c->add_bytes= add_bytes_c;
4326     c->add_bytes_l2= add_bytes_l2_c;
4327     c->diff_bytes= diff_bytes_c;
4328     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4329     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4330     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4331     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4332     c->bswap_buf= bswap_buf;
4333     c->bswap16_buf = bswap16_buf;
4334 #if CONFIG_PNG_DECODER
4335     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4336 #endif
4337
4338     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4339         c->h263_h_loop_filter= h263_h_loop_filter_c;
4340         c->h263_v_loop_filter= h263_v_loop_filter_c;
4341     }
4342
4343     if (CONFIG_VP3_DECODER) {
4344         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4345         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4346         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4347     }
4348
4349     c->h261_loop_filter= h261_loop_filter_c;
4350
4351     c->try_8x8basis= try_8x8basis_c;
4352     c->add_8x8basis= add_8x8basis_c;
4353
4354 #if CONFIG_VORBIS_DECODER
4355     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4356 #endif
4357 #if CONFIG_AC3_DECODER
4358     c->ac3_downmix = ff_ac3_downmix_c;
4359 #endif
4360     c->vector_fmul = vector_fmul_c;
4361     c->vector_fmul_reverse = vector_fmul_reverse_c;
4362     c->vector_fmul_add = vector_fmul_add_c;
4363     c->vector_fmul_window = vector_fmul_window_c;
4364     c->vector_clipf = vector_clipf_c;
4365     c->scalarproduct_int16 = scalarproduct_int16_c;
4366     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4367     c->scalarproduct_float = scalarproduct_float_c;
4368     c->butterflies_float = butterflies_float_c;
4369     c->vector_fmul_scalar = vector_fmul_scalar_c;
4370
4371     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4372     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4373
4374     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4375     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4376
4377     c->shrink[0]= av_image_copy_plane;
4378     c->shrink[1]= ff_shrink22;
4379     c->shrink[2]= ff_shrink44;
4380     c->shrink[3]= ff_shrink88;
4381
4382     c->prefetch= just_return;
4383
4384     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4385     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4386
4387     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4388     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4389     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4390     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4391     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4392     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4393     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4394     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4395     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4396
4397     for(i=0; i<64; i++){
4398         if(!c->put_2tap_qpel_pixels_tab[0][i])
4399             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4400         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4401             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4402     }
4403
4404     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4405     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4406     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4407     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4408
4409     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4410     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4411     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4412     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4413
4414     switch(c->idct_permutation_type){
4415     case FF_NO_IDCT_PERM:
4416         for(i=0; i<64; i++)
4417             c->idct_permutation[i]= i;
4418         break;
4419     case FF_LIBMPEG2_IDCT_PERM:
4420         for(i=0; i<64; i++)
4421             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4422         break;
4423     case FF_SIMPLE_IDCT_PERM:
4424         for(i=0; i<64; i++)
4425             c->idct_permutation[i]= simple_mmx_permutation[i];
4426         break;
4427     case FF_TRANSPOSE_IDCT_PERM:
4428         for(i=0; i<64; i++)
4429             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4430         break;
4431     case FF_PARTTRANS_IDCT_PERM:
4432         for(i=0; i<64; i++)
4433             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4434         break;
4435     case FF_SSE2_IDCT_PERM:
4436         for(i=0; i<64; i++)
4437             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4438         break;
4439     default:
4440         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4441     }
4442 }
4443